mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-26 07:37:40 +02:00
Compare commits
26 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5c7c22c3e1 | |||
| beac5309f1 | |||
| 9d5d882d8c | |||
| 1ec44d178d | |||
| c7cddefcbd | |||
| e9d1b76d0a | |||
| 099bf06952 | |||
| 60bc8866b1 | |||
| e8ecce53b8 | |||
| 683b04cc4a | |||
| f728adab68 | |||
| 3e61ea0e2f | |||
| fdbd6abee2 | |||
| e12a0128ab | |||
| b3ce5cedf4 | |||
| e9fb3b3fc0 | |||
| 9c10954865 | |||
| fdb2c11c70 | |||
| 09cedfd699 | |||
| 8be759e6f7 | |||
| 894bb27af3 | |||
| fb401045cc | |||
| 51eae8cfca | |||
| 1191758c5d | |||
| 00139b660b | |||
| ef9c13d4c2 |
+26
-19
@@ -35,8 +35,20 @@ AMD ZenDNN:
|
||||
documentation:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- "**/*.md"
|
||||
- docs/**
|
||||
- media/**
|
||||
examples:
|
||||
- all:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- app/**
|
||||
- examples/**
|
||||
- tools/**
|
||||
- all-globs-to-all-files:
|
||||
- '!tools/server/**'
|
||||
- '!tools/mtmd/**'
|
||||
- '!tools/ui/**'
|
||||
testing:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
@@ -47,28 +59,12 @@ build:
|
||||
- cmake/**
|
||||
- CMakeLists.txt
|
||||
- CMakePresets.json
|
||||
examples:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- examples/**
|
||||
- tools/**
|
||||
devops:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- .devops/**
|
||||
- .github/**
|
||||
- ci/**
|
||||
python:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- "**/*.py"
|
||||
- requirements/**
|
||||
- gguf-py/**
|
||||
- .flake8
|
||||
script:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- scripts/**
|
||||
android:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
@@ -81,9 +77,20 @@ server:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- tools/server/**
|
||||
|
||||
|
||||
|
||||
mtmd:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- tools/mtmd/**
|
||||
conversion:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- conversion/**
|
||||
- convert_*.py
|
||||
- gguf-py/**
|
||||
vendor:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- vendor/**
|
||||
ggml:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
|
||||
@@ -222,6 +222,16 @@ if (LLAMA_BUILD_APP)
|
||||
add_subdirectory(app)
|
||||
endif()
|
||||
|
||||
# Standalone libmtmd build without pulling in the rest of the tools/ tree.
|
||||
# Useful when packaging just the mtmd library for language bindings (e.g. an
|
||||
# Apple XCFramework, or a WASM build). When the full tools build is enabled,
|
||||
# mtmd is already built by the tools/ subdirectory above; this hook only fires
|
||||
# when LLAMA_BUILD_TOOLS is OFF to avoid double-adding the target.
|
||||
option(LLAMA_BUILD_MTMD "llama: build tools/mtmd library standalone" OFF)
|
||||
if (LLAMA_BUILD_MTMD AND NOT (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS))
|
||||
add_subdirectory(tools/mtmd)
|
||||
endif()
|
||||
|
||||
#
|
||||
# install
|
||||
#
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
set(TARGET llama-app)
|
||||
|
||||
add_executable(${TARGET} llama.cpp)
|
||||
add_executable(${TARGET} llama.cpp download.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "download.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <filesystem>
|
||||
|
||||
static void print_usage(int /*argc*/, char ** argv) {
|
||||
printf(
|
||||
"\nexamples:\n"
|
||||
" %s -hf ggml-org/gemma-3-4b-it-qat-GGUF\n"
|
||||
" %s -hf ggml-org/gemma-3-4b-it-qat-GGUF:Q4_K_M\n"
|
||||
" %s -hf ggml-org/models -hff model.gguf\n"
|
||||
" %s -mu https://example.com/model.gguf -m model.gguf\n"
|
||||
"\n",
|
||||
argv[0], argv[0], argv[0], argv[0]
|
||||
);
|
||||
}
|
||||
|
||||
int llama_download(int argc, char ** argv);
|
||||
|
||||
int llama_download(int argc, char ** argv) {
|
||||
common_init();
|
||||
|
||||
common_params params;
|
||||
params.verbosity = LOG_LEVEL_ERROR;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DOWNLOAD, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const bool has_source = !params.model.hf_repo.empty() || !params.model.url.empty() ||
|
||||
!params.model.path.empty() || !params.model.docker_repo.empty();
|
||||
if (!has_source) {
|
||||
fprintf(stderr, "error: no model source specified (use --hf-repo, --model-url, --model or --docker-repo)\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
try {
|
||||
common_models_handler handler = common_models_handler_init(params, LLAMA_EXAMPLE_DOWNLOAD);
|
||||
common_models_handler_apply(handler, params);
|
||||
} catch (const std::exception & e) {
|
||||
fprintf(stderr, "error: %s\n", e.what());
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!params.models_preset.empty()) {
|
||||
// -hf pointed at a preset repo: print the preset path and stop
|
||||
printf("%s\n", params.models_preset.c_str());
|
||||
return 0;
|
||||
}
|
||||
if (params.model.path.empty()) {
|
||||
fprintf(stderr, "error: model download failed\n");
|
||||
return 1;
|
||||
}
|
||||
if (!std::filesystem::exists(params.model.path)) {
|
||||
fprintf(stderr, "error: model file does not exist: %s\n", params.model.path.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("%s\n", params.model.path.c_str());
|
||||
if (!params.mmproj.path.empty()) {
|
||||
printf("%s\n", params.mmproj.path.c_str());
|
||||
}
|
||||
if (!params.speculative.draft.mparams.path.empty()) {
|
||||
printf("%s\n", params.speculative.draft.mparams.path.c_str());
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -19,6 +19,7 @@ int llama_batched_bench(int argc, char ** argv);
|
||||
int llama_fit_params(int argc, char ** argv);
|
||||
int llama_quantize(int argc, char ** argv);
|
||||
int llama_perplexity(int argc, char ** argv);
|
||||
int llama_download(int argc, char ** argv);
|
||||
|
||||
// Self-update is only supported for binaries built with llama-install.sh
|
||||
static int llama_update(int argc, char ** argv) {
|
||||
@@ -61,6 +62,7 @@ static const command cmds[] = {
|
||||
{"serve", "HTTP API server", {"server"}, false, llama_server },
|
||||
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
|
||||
{"update", "Update llama to the latest release", {}, UPDATE_HIDDEN, llama_update },
|
||||
{"download", "Download a model", {"get"}, false, llama_download },
|
||||
{"completion", "Text completion", {"complete"}, true, llama_completion },
|
||||
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
|
||||
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
|
||||
|
||||
@@ -13,6 +13,7 @@ LLAMA_BUILD_EXAMPLES=OFF
|
||||
LLAMA_BUILD_TOOLS=OFF
|
||||
LLAMA_BUILD_TESTS=OFF
|
||||
LLAMA_BUILD_SERVER=OFF
|
||||
LLAMA_BUILD_MTMD=ON
|
||||
GGML_METAL=ON
|
||||
GGML_METAL_EMBED_LIBRARY=ON
|
||||
GGML_BLAS_DEFAULT=ON
|
||||
@@ -39,6 +40,7 @@ COMMON_CMAKE_ARGS=(
|
||||
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
|
||||
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
|
||||
-DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
|
||||
-DLLAMA_BUILD_MTMD=${LLAMA_BUILD_MTMD}
|
||||
-DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
|
||||
-DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
|
||||
-DGGML_METAL=${GGML_METAL}
|
||||
@@ -126,6 +128,8 @@ setup_framework_structure() {
|
||||
cp ggml/include/ggml-cpu.h ${header_path}
|
||||
cp ggml/include/ggml-blas.h ${header_path}
|
||||
cp ggml/include/gguf.h ${header_path}
|
||||
cp tools/mtmd/mtmd.h ${header_path}
|
||||
cp tools/mtmd/mtmd-helper.h ${header_path}
|
||||
|
||||
# Create module map (common for all platforms)
|
||||
cat > ${module_path}module.modulemap << EOF
|
||||
@@ -247,6 +251,7 @@ combine_static_libraries() {
|
||||
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
|
||||
"${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
|
||||
)
|
||||
|
||||
# Create temporary directory for processing
|
||||
@@ -410,6 +415,7 @@ cmake -B build-ios-sim -G Xcode \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DMTMD_VIDEO=OFF \
|
||||
-S .
|
||||
cmake --build build-ios-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
@@ -424,6 +430,7 @@ cmake -B build-ios-device -G Xcode \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DMTMD_VIDEO=OFF \
|
||||
-S .
|
||||
cmake --build build-ios-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
@@ -450,6 +457,7 @@ cmake -B build-visionos -G Xcode \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DMTMD_VIDEO=OFF \
|
||||
-S .
|
||||
cmake --build build-visionos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
@@ -465,6 +473,7 @@ cmake -B build-visionos-sim -G Xcode \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DMTMD_VIDEO=OFF \
|
||||
-S .
|
||||
cmake --build build-visionos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
@@ -481,6 +490,7 @@ cmake -B build-tvos-sim -G Xcode \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DMTMD_VIDEO=OFF \
|
||||
-S .
|
||||
cmake --build build-tvos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
@@ -496,6 +506,7 @@ cmake -B build-tvos-device -G Xcode \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DMTMD_VIDEO=OFF \
|
||||
-S .
|
||||
cmake --build build-tvos-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
|
||||
@@ -80,8 +80,6 @@ add_library(${TARGET}
|
||||
http.h
|
||||
imatrix-loader.cpp
|
||||
imatrix-loader.h
|
||||
json-partial.cpp
|
||||
json-partial.h
|
||||
json-schema-to-grammar.cpp
|
||||
llguidance.cpp
|
||||
log.cpp
|
||||
|
||||
+212
-123
@@ -297,60 +297,6 @@ struct handle_model_result {
|
||||
std::string preset_path;
|
||||
};
|
||||
|
||||
static handle_model_result common_params_handle_model(struct common_params_model & model,
|
||||
const common_download_opts & opts) {
|
||||
handle_model_result result;
|
||||
|
||||
// TODO @ngxson : refactor this into a new common_model_download_context
|
||||
|
||||
if (!model.docker_repo.empty()) {
|
||||
model.path = common_docker_resolve_model(model.docker_repo);
|
||||
} else if (!model.hf_repo.empty()) {
|
||||
// If -m was used with -hf, treat the model "path" as the hf_file to download
|
||||
if (model.hf_file.empty() && !model.path.empty()) {
|
||||
model.hf_file = model.path;
|
||||
model.path = "";
|
||||
}
|
||||
common_download_opts hf_opts = opts;
|
||||
auto download_result = common_download_model(model, hf_opts);
|
||||
|
||||
if (!download_result.preset_path.empty()) {
|
||||
result.found_preset = true;
|
||||
result.preset_path = download_result.preset_path;
|
||||
return result; // skip everything else if preset.ini is used
|
||||
}
|
||||
|
||||
if (download_result.model_path.empty()) {
|
||||
throw std::runtime_error("failed to download model from Hugging Face");
|
||||
}
|
||||
|
||||
model.path = download_result.model_path;
|
||||
|
||||
if (!download_result.mmproj_path.empty()) {
|
||||
result.found_mmproj = true;
|
||||
result.mmproj.path = download_result.mmproj_path;
|
||||
}
|
||||
|
||||
if (!download_result.mtp_path.empty()) {
|
||||
result.found_mtp = true;
|
||||
result.mtp.path = download_result.mtp_path;
|
||||
}
|
||||
} else if (!model.url.empty()) {
|
||||
if (model.path.empty()) {
|
||||
auto f = string_split<std::string>(model.url, '#').front();
|
||||
f = string_split<std::string>(f, '?').front();
|
||||
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
|
||||
}
|
||||
|
||||
auto download_result = common_download_model(model, opts);
|
||||
if (download_result.model_path.empty()) {
|
||||
throw std::runtime_error("failed to download model from " + model.url);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const std::vector<ggml_type> kv_cache_types = {
|
||||
GGML_TYPE_F32,
|
||||
GGML_TYPE_F16,
|
||||
@@ -395,77 +341,204 @@ static bool parse_bool_value(const std::string & value) {
|
||||
}
|
||||
|
||||
//
|
||||
// CLI argument parsing functions
|
||||
// common_models_handler
|
||||
//
|
||||
|
||||
bool common_params_handle_models(common_params & params, llama_example curr_ex, const common_params_handle_models_params & handle_params) {
|
||||
const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
|
||||
params.speculative.types.end(),
|
||||
COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
|
||||
static std::string get_default_local_path(const std::string & url) {
|
||||
auto f = string_split<std::string>(url, '#').front();
|
||||
f = string_split<std::string>(f, '?').front();
|
||||
return fs_get_cache_file(string_split<std::string>(f, '/').back());
|
||||
}
|
||||
|
||||
common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex) {
|
||||
common_download_hf_plan plan;
|
||||
common_download_opts opts;
|
||||
|
||||
const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
|
||||
params.speculative.types.end(),
|
||||
COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
|
||||
|
||||
// only download mmproj if the current example is using it
|
||||
bool use_mmproj = false;
|
||||
for (const auto & ex : mmproj_examples) {
|
||||
if (curr_ex == ex) {
|
||||
use_mmproj = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
opts.bearer_token = params.hf_token;
|
||||
opts.offline = params.offline;
|
||||
opts.skip_download = params.skip_download;
|
||||
opts.download_mtp = spec_type_draft_mtp;
|
||||
opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty();
|
||||
opts.preset_only = handle_params.preset_only;
|
||||
opts.download_mmproj = use_mmproj && !params.no_mmproj
|
||||
&& params.mmproj.path.empty() && params.mmproj.url.empty();
|
||||
|
||||
if (handle_params.callback) {
|
||||
opts.callback = handle_params.callback;
|
||||
if (!params.model.hf_repo.empty()) {
|
||||
plan = common_download_get_hf_plan(params.model, opts);
|
||||
}
|
||||
|
||||
// sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
|
||||
// so we should not auto-discover mtp/mmproj siblings for them
|
||||
common_download_opts sub_opts = opts;
|
||||
sub_opts.download_mtp = false;
|
||||
sub_opts.download_mmproj = false;
|
||||
return common_models_handler{plan, opts};
|
||||
}
|
||||
|
||||
try {
|
||||
auto res = common_params_handle_model(params.model, opts);
|
||||
if (res.found_preset) {
|
||||
if (!params.models_preset.empty()) {
|
||||
throw std::invalid_argument("cannot use both --models-preset and -hf with a preset.ini file");
|
||||
bool common_models_handler_is_preset_repo(const common_models_handler & handler) {
|
||||
return !handler.plan.preset.url.empty();
|
||||
}
|
||||
|
||||
static std::vector<common_download_task> build_url_tasks(const common_params_model & model, common_download_opts opts) {
|
||||
auto parts = common_download_get_all_parts(model.url);
|
||||
std::vector<common_download_task> tasks;
|
||||
|
||||
// single-part: download straight to model.path if the user gave one (-m), else the cache default
|
||||
if (parts.size() == 1) {
|
||||
common_download_task task;
|
||||
task.url = parts[0];
|
||||
task.local_path = model.path.empty() ? get_default_local_path(parts[0]) : model.path;
|
||||
task.opts = opts;
|
||||
tasks.push_back(std::move(task));
|
||||
return tasks;
|
||||
}
|
||||
|
||||
// multi-part: place each part under the user's -m directory (if given), else the cache default
|
||||
std::string base_dir;
|
||||
if (!model.path.empty()) {
|
||||
auto pos = model.path.rfind('/');
|
||||
base_dir = pos == std::string::npos ? std::string(".") : model.path.substr(0, pos);
|
||||
}
|
||||
|
||||
for (const auto & part : parts) {
|
||||
common_download_task task;
|
||||
task.url = part;
|
||||
task.opts = opts;
|
||||
|
||||
std::string local = get_default_local_path(part);
|
||||
if (!base_dir.empty()) {
|
||||
auto pos = local.rfind('/');
|
||||
std::string name = pos == std::string::npos ? local : local.substr(pos + 1);
|
||||
local = base_dir + "/" + name;
|
||||
}
|
||||
task.local_path = local;
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
return tasks;
|
||||
}
|
||||
|
||||
void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback) {
|
||||
std::vector<common_download_task> tasks;
|
||||
|
||||
auto & plan = handler.plan;
|
||||
|
||||
auto opts = handler.opts; // copy
|
||||
opts.callback = callback;
|
||||
|
||||
// handle plain "url" if needed
|
||||
auto handle_url = [&](common_params_model & model) {
|
||||
if (!model.url.empty()) {
|
||||
if (model.path.empty()) {
|
||||
model.path = get_default_local_path(model.url);
|
||||
}
|
||||
}
|
||||
};
|
||||
handle_url(params.model);
|
||||
handle_url(params.mmproj);
|
||||
handle_url(params.vocoder.model);
|
||||
handle_url(params.speculative.draft.mparams);
|
||||
|
||||
// optionally, if docker repo is set, resolve it
|
||||
if (!params.model.docker_repo.empty()) {
|
||||
params.model.url = common_docker_resolve_model(params.model.docker_repo);
|
||||
params.model.path = get_default_local_path(params.model.url);
|
||||
}
|
||||
|
||||
// handle plain "url" tasks (non-hf)
|
||||
if (!params.model.url.empty()) {
|
||||
auto url_tasks = build_url_tasks(params.model, opts);
|
||||
// the first part is what gets loaded, so point params.model.path at it
|
||||
if (!url_tasks.empty()) {
|
||||
std::string first_path = url_tasks.front().local_path;
|
||||
url_tasks.front().on_done = [&]() { params.model.path = first_path; };
|
||||
}
|
||||
for (auto & task : url_tasks) {
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
}
|
||||
if (!params.mmproj.url.empty()) {
|
||||
common_download_task task;
|
||||
task.url = params.mmproj.url;
|
||||
task.local_path = params.mmproj.path;
|
||||
task.opts = opts;
|
||||
tasks.push_back(task);
|
||||
}
|
||||
if (!params.vocoder.model.url.empty()) {
|
||||
common_download_task task;
|
||||
task.url = params.vocoder.model.url;
|
||||
task.local_path = params.vocoder.model.path;
|
||||
task.opts = opts;
|
||||
tasks.push_back(task);
|
||||
}
|
||||
if (!params.speculative.draft.mparams.url.empty()) {
|
||||
common_download_task task;
|
||||
task.url = params.speculative.draft.mparams.url;
|
||||
task.local_path = params.speculative.draft.mparams.path;
|
||||
task.opts = opts;
|
||||
tasks.push_back(task);
|
||||
}
|
||||
|
||||
// handle hf_plan tasks
|
||||
if (!plan.model_files.empty()) {
|
||||
for (size_t i = 0; i < plan.model_files.size(); ++i) {
|
||||
auto & model_file = plan.model_files[i];
|
||||
bool is_first = (i == 0);
|
||||
tasks.emplace_back(model_file, opts, [&, is_first]() {
|
||||
if (is_first) {
|
||||
// only use first part as model path
|
||||
params.model.path = hf_cache::finalize_file(model_file);
|
||||
} else {
|
||||
hf_cache::finalize_file(model_file);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
if (!plan.mmproj.local_path.empty()) {
|
||||
tasks.emplace_back(plan.mmproj, opts, [&]() {
|
||||
params.mmproj.path = hf_cache::finalize_file(plan.mmproj);
|
||||
});
|
||||
}
|
||||
if (!plan.mtp.local_path.empty()) {
|
||||
tasks.emplace_back(plan.mtp, opts, [&]() {
|
||||
// only fall back to the discovered MTP head when no draft was explicitly provided
|
||||
if (params.speculative.draft.mparams.empty()) {
|
||||
params.speculative.draft.mparams.path = hf_cache::finalize_file(plan.mtp);
|
||||
} else {
|
||||
hf_cache::finalize_file(plan.mtp);
|
||||
}
|
||||
});
|
||||
}
|
||||
if (!plan.preset.local_path.empty()) {
|
||||
tasks.emplace_back(plan.preset, opts, [&]() {
|
||||
// if HF repo is a preset repo, we simply run server in router mode with the preset.ini file
|
||||
params.models_preset_hf = params.model.hf_repo; // only for showing a warning
|
||||
params.models_preset = res.preset_path;
|
||||
params.models_preset = hf_cache::finalize_file(plan.preset);
|
||||
params.model = common_params_model{}; // make sure to clear model, so server starts in router mode
|
||||
return true;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (params.no_mmproj) {
|
||||
params.mmproj = {};
|
||||
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
||||
// optionally, handle mmproj model when -hf is specified
|
||||
params.mmproj = res.mmproj;
|
||||
}
|
||||
// only download mmproj if the current example is using it
|
||||
for (const auto & ex : mmproj_examples) {
|
||||
if (curr_ex == ex) {
|
||||
common_params_handle_model(params.mmproj, sub_opts);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// run all tasks in parallel
|
||||
if (!params.offline) {
|
||||
common_download_run_tasks(tasks);
|
||||
}
|
||||
|
||||
// when --spec-type mtp is set and no draft model was provided explicitly,
|
||||
// fall back to the MTP head discovered alongside the -hf model
|
||||
if (spec_type_draft_mtp && res.found_mtp &&
|
||||
params.speculative.draft.mparams.path.empty() &&
|
||||
params.speculative.draft.mparams.hf_repo.empty() &&
|
||||
params.speculative.draft.mparams.url.empty()) {
|
||||
params.speculative.draft.mparams.path = res.mtp.path;
|
||||
// download successful, update params with the downloaded paths
|
||||
for (const auto & task : tasks) {
|
||||
if (task.on_done) {
|
||||
task.on_done();
|
||||
}
|
||||
common_params_handle_model(params.speculative.draft.mparams, sub_opts);
|
||||
common_params_handle_model(params.vocoder.model, sub_opts);
|
||||
return true;
|
||||
} catch (const common_skip_download_exception &) {
|
||||
return false;
|
||||
} catch (const std::exception &) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// CLI argument parsing functions
|
||||
//
|
||||
|
||||
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
|
||||
common_params & params = ctx_arg.params;
|
||||
|
||||
@@ -594,12 +667,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
const bool skip_model_download =
|
||||
// server will call common_params_handle_models() later, so we skip it here
|
||||
ctx_arg.ex == LLAMA_EXAMPLE_SERVER ||
|
||||
// download calls common_params_handle_models() itself and prints the paths
|
||||
ctx_arg.ex == LLAMA_EXAMPLE_DOWNLOAD ||
|
||||
// export_graph_ops loads only metadata
|
||||
ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
|
||||
|
||||
if (!skip_model_download) {
|
||||
// handle model and download
|
||||
common_params_handle_models(params, ctx_arg.ex, {});
|
||||
common_models_handler handler = common_models_handler_init(params, ctx_arg.ex);
|
||||
common_models_handler_apply(handler, params);
|
||||
|
||||
// model is required (except for server)
|
||||
// TODO @ngxson : maybe show a list of available models in CLI in this case
|
||||
@@ -671,15 +747,19 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
|
||||
common_options.push_back(&opt);
|
||||
}
|
||||
}
|
||||
printf("----- common params -----\n\n");
|
||||
print_options(common_options);
|
||||
printf("\n\n----- sampling params -----\n\n");
|
||||
print_options(sampling_options);
|
||||
printf("\n\n----- speculative params -----\n\n");
|
||||
print_options(spec_options);
|
||||
// TODO: maybe convert enum llama_example to string
|
||||
printf("\n\n----- example-specific params -----\n\n");
|
||||
print_options(specific_options);
|
||||
bool first = true;
|
||||
auto print_section = [&](const char * header, std::vector<common_arg *> & options) {
|
||||
if (options.empty()) {
|
||||
return;
|
||||
}
|
||||
printf("%s----- %s -----\n\n", first ? "" : "\n\n", header);
|
||||
first = false;
|
||||
print_options(options);
|
||||
};
|
||||
print_section("common params", common_options);
|
||||
print_section("sampling params", sampling_options);
|
||||
print_section("speculative params", spec_options);
|
||||
print_section("example-specific params", specific_options);
|
||||
}
|
||||
|
||||
static void common_params_print_completion(common_params_context & ctx_arg) {
|
||||
@@ -1079,7 +1159,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
* - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
|
||||
*/
|
||||
auto add_opt = [&](common_arg arg) {
|
||||
if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
|
||||
// download only exposes the handful of args explicitly tagged for it
|
||||
const bool inherit_common = ex != LLAMA_EXAMPLE_DOWNLOAD;
|
||||
if ((arg.in_example(ex) || (inherit_common && arg.in_example(LLAMA_EXAMPLE_COMMON))) && !arg.is_exclude(ex)) {
|
||||
ctx_arg.options.push_back(std::move(arg));
|
||||
}
|
||||
};
|
||||
@@ -1090,7 +1172,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.usage = true;
|
||||
}
|
||||
));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}));
|
||||
add_opt(common_arg(
|
||||
{"--version"},
|
||||
"show version and build info",
|
||||
@@ -2212,7 +2294,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, bool value) {
|
||||
params.no_mmproj = !value;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
|
||||
).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MMPROJ_AUTO"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj-offload"},
|
||||
{"--no-mmproj-offload"},
|
||||
@@ -2611,14 +2693,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.path = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL"));
|
||||
add_opt(common_arg(
|
||||
{"-mu", "--model-url"}, "MODEL_URL",
|
||||
"model download url (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.url = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_MODEL_URL"));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL_URL"));
|
||||
add_opt(common_arg(
|
||||
{ "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
|
||||
"Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
|
||||
@@ -2627,7 +2709,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.docker_repo = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_DOCKER_REPO"));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_DOCKER_REPO"));
|
||||
add_opt(common_arg(
|
||||
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
|
||||
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
|
||||
@@ -2637,14 +2719,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.hf_repo = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HF_REPO"));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_REPO"));
|
||||
add_opt(common_arg(
|
||||
{"-hff", "--hf-file"}, "FILE",
|
||||
"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.hf_file = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HF_FILE"));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
|
||||
"Hugging Face model repository for the vocoder model (default: unused)",
|
||||
@@ -2665,7 +2747,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.hf_token = value;
|
||||
}
|
||||
).set_env("HF_TOKEN"));
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("HF_TOKEN"));
|
||||
add_opt(common_arg(
|
||||
{"--mtp"},
|
||||
"also download the multi-token prediction (MTP) head, if available (default: unused)",
|
||||
[](common_params & params) {
|
||||
params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_DRAFT_MTP);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_DOWNLOAD}));
|
||||
add_opt(common_arg(
|
||||
{"--context-file"}, "FNAME",
|
||||
"file to load context from (use comma-separated values to specify multiple files)",
|
||||
|
||||
+12
-11
@@ -8,6 +8,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
|
||||
// pseudo-env variable to identify preset-only arguments
|
||||
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
|
||||
@@ -130,19 +131,19 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
|
||||
// see: https://github.com/ggml-org/llama.cpp/issues/18163
|
||||
void common_params_add_preset_options(std::vector<common_arg> & args);
|
||||
|
||||
struct common_params_handle_models_params {
|
||||
common_download_callback * callback = nullptr;
|
||||
bool preset_only = false; // if true, only check & download remote preset (for router mode)
|
||||
struct common_models_handler {
|
||||
common_download_hf_plan plan;
|
||||
common_download_opts opts;
|
||||
};
|
||||
|
||||
// populate model paths (main model, mmproj, etc) from -hf if necessary
|
||||
// return true if the model is ready to use
|
||||
// throw an exception if there is an error that prevents the model from being used (e.g. network error, model not found, etc)
|
||||
// if params.skip_download is true, no downloads will be attempted. return false if the model is invalid or missing (e.g. ETag check failed)
|
||||
bool common_params_handle_models(
|
||||
common_params & params,
|
||||
llama_example curr_ex,
|
||||
const common_params_handle_models_params & handle_params);
|
||||
// initialize downloading opts and hf_plan if needed, but does not download anything yet
|
||||
common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex);
|
||||
|
||||
// check if the model is a preset repo (i.e. has a preset file)
|
||||
bool common_models_handler_is_preset_repo(const common_models_handler & handler);
|
||||
|
||||
// download and update params with the downloaded model path
|
||||
void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback = nullptr);
|
||||
|
||||
// initialize argument parser context - used by test-arg-parser and preset
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
@@ -2758,5 +2758,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
|
||||
std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates) {
|
||||
GGML_ASSERT(chat_templates != nullptr);
|
||||
GGML_ASSERT(chat_templates->template_default != nullptr);
|
||||
if (chat_templates->template_tool_use != nullptr) {
|
||||
// take the more expressive template when available
|
||||
return chat_templates->template_tool_use->caps.to_map();
|
||||
}
|
||||
return chat_templates->template_default->caps.to_map();
|
||||
}
|
||||
|
||||
+12
-8
@@ -96,6 +96,7 @@ enum llama_example {
|
||||
LLAMA_EXAMPLE_FIT_PARAMS,
|
||||
LLAMA_EXAMPLE_RESULTS,
|
||||
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
|
||||
LLAMA_EXAMPLE_DOWNLOAD,
|
||||
|
||||
LLAMA_EXAMPLE_COUNT,
|
||||
};
|
||||
@@ -290,13 +291,13 @@ struct common_params_sampling {
|
||||
};
|
||||
|
||||
struct common_params_model {
|
||||
std::string path = ""; // model local path // NOLINT
|
||||
std::string url = ""; // model url to download // NOLINT
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string docker_repo = ""; // Docker repo // NOLINT
|
||||
std::string path = ""; // model local path
|
||||
std::string url = ""; // model url to download
|
||||
std::string hf_repo = ""; // HF repo
|
||||
std::string hf_file = ""; // HF file
|
||||
std::string docker_repo = ""; // Docker repo
|
||||
|
||||
std::string get_name() {
|
||||
std::string get_name() const {
|
||||
if (!hf_repo.empty()) {
|
||||
return hf_repo;
|
||||
}
|
||||
@@ -305,6 +306,10 @@ struct common_params_model {
|
||||
}
|
||||
return path;
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return get_name().empty();
|
||||
}
|
||||
};
|
||||
|
||||
// draft-model-based speculative decoding parameters
|
||||
@@ -367,7 +372,7 @@ struct common_params_speculative {
|
||||
common_params_speculative_ngram_cache ngram_cache;
|
||||
|
||||
bool has_dft() const {
|
||||
return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
|
||||
return !draft.mparams.empty();
|
||||
}
|
||||
|
||||
uint32_t need_n_rs_seq() const {
|
||||
@@ -519,7 +524,6 @@ struct common_params {
|
||||
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||
int32_t control_vector_layer_end = -1; // layer range for control vector
|
||||
bool offline = false;
|
||||
bool skip_download = false; // skip model file downloading
|
||||
|
||||
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
||||
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
||||
|
||||
+22
-118
@@ -292,10 +292,6 @@ static int common_download_file_single_online(const std::string & url,
|
||||
|
||||
const bool file_exists = std::filesystem::exists(path);
|
||||
|
||||
if (!file_exists && opts.skip_download) {
|
||||
return -2; // file is missing and download is disabled
|
||||
}
|
||||
|
||||
if (file_exists && skip_etag) {
|
||||
LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
|
||||
return 304; // 304 Not Modified - fake cached response
|
||||
@@ -362,9 +358,6 @@ static int common_download_file_single_online(const std::string & url,
|
||||
return 304; // 304 Not Modified - fake cached response
|
||||
}
|
||||
// pass this point, the file exists but is different from the server version, so we need to redownload it
|
||||
if (opts.skip_download) {
|
||||
return -2; // special code to indicate that the download was skipped due to etag mismatch
|
||||
}
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return -1;
|
||||
@@ -691,19 +684,8 @@ static void list_available_gguf_files(const hf_cache::hf_files & files) {
|
||||
}
|
||||
}
|
||||
|
||||
struct hf_plan {
|
||||
hf_cache::hf_file primary;
|
||||
hf_cache::hf_files model_files;
|
||||
hf_cache::hf_file mmproj;
|
||||
hf_cache::hf_file mtp;
|
||||
hf_cache::hf_file preset; // if set, only this file is downloaded
|
||||
};
|
||||
|
||||
static hf_plan get_hf_plan(const common_params_model & model,
|
||||
const common_download_opts & opts,
|
||||
bool download_mmproj,
|
||||
bool download_mtp) {
|
||||
hf_plan plan;
|
||||
common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts) {
|
||||
common_download_hf_plan plan;
|
||||
hf_cache::hf_files all;
|
||||
|
||||
auto [repo, tag] = common_download_split_repo_tag(model.hf_repo);
|
||||
@@ -752,127 +734,49 @@ static hf_plan get_hf_plan(const common_params_model & model,
|
||||
plan.primary = primary;
|
||||
plan.model_files = get_split_files(all, primary);
|
||||
|
||||
if (download_mmproj) {
|
||||
if (opts.download_mmproj) {
|
||||
plan.mmproj = find_best_mmproj(all, primary.path);
|
||||
}
|
||||
|
||||
if (download_mtp) {
|
||||
if (opts.download_mtp) {
|
||||
plan.mtp = find_best_mtp(all, primary.path);
|
||||
}
|
||||
|
||||
return plan;
|
||||
}
|
||||
|
||||
struct download_task {
|
||||
std::string url;
|
||||
std::string path;
|
||||
};
|
||||
|
||||
static std::vector<download_task> get_url_tasks(const common_params_model & model) {
|
||||
auto split = get_gguf_split_info(model.url);
|
||||
|
||||
if (split.count <= 1) {
|
||||
return {{model.url, model.path}};
|
||||
}
|
||||
|
||||
auto filename = split.prefix;
|
||||
if (auto pos = split.prefix.rfind('/'); pos != std::string::npos) {
|
||||
filename = split.prefix.substr(pos + 1);
|
||||
}
|
||||
|
||||
auto parent_path = std::filesystem::path(model.path).parent_path();
|
||||
auto prefix_path = (parent_path / filename).string();
|
||||
|
||||
std::vector<download_task> tasks;
|
||||
for (int i = 1; i <= split.count; i++) {
|
||||
auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
|
||||
tasks.push_back({split.prefix + suffix, prefix_path + suffix});
|
||||
}
|
||||
return tasks;
|
||||
}
|
||||
|
||||
common_download_model_result common_download_model(const common_params_model & model,
|
||||
const common_download_opts & opts) {
|
||||
common_download_model_result result;
|
||||
std::vector<download_task> tasks;
|
||||
hf_plan hf;
|
||||
|
||||
bool download_mmproj = opts.download_mmproj;
|
||||
bool download_mtp = opts.download_mtp;
|
||||
bool preset_only = opts.preset_only;
|
||||
bool is_hf = !model.hf_repo.empty();
|
||||
|
||||
if (is_hf) {
|
||||
hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
|
||||
if (!hf.preset.path.empty()) {
|
||||
// if preset.ini exists, only download that file alone
|
||||
tasks.push_back({hf.preset.url, hf.preset.local_path});
|
||||
} else if (!preset_only) {
|
||||
// only add other files if we're NOT in preset-only mode (normal run, non-router)
|
||||
for (const auto & f : hf.model_files) {
|
||||
tasks.push_back({f.url, f.local_path});
|
||||
}
|
||||
if (!hf.mmproj.path.empty()) {
|
||||
tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
|
||||
}
|
||||
if (!hf.mtp.path.empty()) {
|
||||
tasks.push_back({hf.mtp.url, hf.mtp.local_path});
|
||||
}
|
||||
}
|
||||
} else if (!model.url.empty()) {
|
||||
tasks = get_url_tasks(model);
|
||||
} else {
|
||||
result.model_path = model.path;
|
||||
return result;
|
||||
}
|
||||
|
||||
if (tasks.empty()) {
|
||||
return result;
|
||||
}
|
||||
|
||||
void common_download_run_tasks(const std::vector<common_download_task> & tasks) {
|
||||
std::vector<std::future<int>> futures;
|
||||
for (const auto & task : tasks) {
|
||||
futures.push_back(std::async(std::launch::async,
|
||||
[&task, &opts, is_hf]() {
|
||||
return common_download_file_single(task.url, task.path, opts, is_hf);
|
||||
[&task]() {
|
||||
return common_download_file_single(task.url, task.local_path, task.opts, task.is_hf);
|
||||
}
|
||||
));
|
||||
}
|
||||
|
||||
for (auto & f : futures) {
|
||||
int status = f.get();
|
||||
if (status == -2 && opts.skip_download) {
|
||||
throw common_skip_download_exception();
|
||||
}
|
||||
for (size_t i = 0; i < futures.size(); ++i) {
|
||||
std::string url = tasks[i].url;
|
||||
int status = futures[i].get();
|
||||
bool is_ok = is_http_status_ok(status);
|
||||
if (!is_ok) {
|
||||
return {};
|
||||
throw std::runtime_error(string_format("Download '%s' failed with status code: %d", url.c_str(), status));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (is_hf) {
|
||||
if (!hf.preset.path.empty()) {
|
||||
// if preset.ini is used, do not set other paths
|
||||
result.preset_path = hf_cache::finalize_file(hf.preset);
|
||||
} else {
|
||||
for (const auto & f : hf.model_files) {
|
||||
hf_cache::finalize_file(f);
|
||||
}
|
||||
result.model_path = hf.primary.final_path;
|
||||
std::vector<std::string> common_download_get_all_parts(const std::string & url) {
|
||||
auto split = get_gguf_split_info(url);
|
||||
|
||||
if (!hf.mmproj.path.empty()) {
|
||||
result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
|
||||
}
|
||||
|
||||
if (!hf.mtp.path.empty()) {
|
||||
result.mtp_path = hf_cache::finalize_file(hf.mtp);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
result.model_path = model.path;
|
||||
if (split.count <= 1) {
|
||||
return {url};
|
||||
}
|
||||
|
||||
return result;
|
||||
std::vector<std::string> parts;
|
||||
for (int i = 1; i <= split.count; i++) {
|
||||
auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
|
||||
parts.push_back(split.prefix + suffix);
|
||||
}
|
||||
return parts;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
+28
-43
@@ -1,7 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#include "hf-cache.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
|
||||
struct common_params_model;
|
||||
|
||||
@@ -47,67 +50,40 @@ struct common_cached_model_info {
|
||||
}
|
||||
};
|
||||
|
||||
// Options for common_download_model and common_download_file_single
|
||||
// Options for common_download_file_single
|
||||
struct common_download_opts {
|
||||
std::string bearer_token;
|
||||
common_header_list headers;
|
||||
bool offline = false;
|
||||
bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid
|
||||
bool download_mmproj = false;
|
||||
bool download_mtp = false;
|
||||
bool preset_only = false; // if true, only check & download remote preset (for router mode)
|
||||
common_download_callback * callback = nullptr;
|
||||
};
|
||||
|
||||
// Result of common_download_model
|
||||
struct common_download_model_result {
|
||||
std::string model_path;
|
||||
std::string mmproj_path;
|
||||
std::string mtp_path;
|
||||
std::string preset_path;
|
||||
struct common_download_task {
|
||||
common_download_opts opts;
|
||||
std::string url;
|
||||
std::string local_path;
|
||||
std::function<void()> on_done;
|
||||
bool is_hf = false;
|
||||
|
||||
common_download_task() = default;
|
||||
common_download_task(hf_cache::hf_file f,
|
||||
const common_download_opts & opts,
|
||||
std::function<void()> on_done = nullptr)
|
||||
: opts(opts), url(f.url), local_path(f.local_path), on_done(on_done), is_hf(true) {}
|
||||
};
|
||||
|
||||
// throw if the file is missing or invalid (e.g. ETag check failed)
|
||||
struct common_skip_download_exception : public std::runtime_error {
|
||||
common_skip_download_exception() : std::runtime_error("skip download") {}
|
||||
};
|
||||
void common_download_run_tasks(const std::vector<common_download_task> & tasks);
|
||||
|
||||
// Download model from HuggingFace repo or URL
|
||||
//
|
||||
// input (via model struct):
|
||||
// - model.hf_repo: HF repo with optional tag, see common_download_split_repo_tag
|
||||
// - model.hf_file: specific file in the repo (requires hf_repo)
|
||||
// - model.url: simple download (used if hf_repo is empty)
|
||||
// - model.path: local file path
|
||||
//
|
||||
// tag matching (for HF repos without model.hf_file):
|
||||
// - if tag is specified, searches for GGUF matching that quantization
|
||||
// - if no tag, searches for Q4_K_M, then Q4_0, then first available GGUF
|
||||
//
|
||||
// split GGUF: multi-part files like "model-00001-of-00003.gguf" are automatically
|
||||
// detected and all parts are downloaded
|
||||
//
|
||||
// caching:
|
||||
// - HF repos: uses HuggingFace cache
|
||||
// - URLs: uses ETag-based caching
|
||||
//
|
||||
// when opts.offline=true, no network requests are made
|
||||
// when download_mmproj=true, searches for mmproj in same directory as model or any parent directory
|
||||
// then with the closest quantization bits
|
||||
// when download_mtp=true, applies the same sibling search for an MTP-head GGUF
|
||||
//
|
||||
// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
|
||||
common_download_model_result common_download_model(
|
||||
const common_params_model & model,
|
||||
const common_download_opts & opts = {}
|
||||
);
|
||||
// if url is a multi-part GGUF file, returns all parts, otherwise returns the single file
|
||||
std::vector<std::string> common_download_get_all_parts(const std::string & url);
|
||||
|
||||
// returns list of cached models
|
||||
std::vector<common_cached_model_info> common_list_cached_models();
|
||||
|
||||
// download single file from url to local path
|
||||
// returns status code or -1 on error
|
||||
// returns -2 if the download was skipped due to ETag mismatch (file outdated, skip_download=true)
|
||||
// skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
|
||||
int common_download_file_single(const std::string & url,
|
||||
const std::string & path,
|
||||
@@ -124,3 +100,12 @@ std::string common_docker_resolve_model(const std::string & docker);
|
||||
// - if tag is present, removes only files matching that tag (and orphaned blobs)
|
||||
// returns true if anything was removed
|
||||
bool common_download_remove(const std::string & hf_repo_with_tag);
|
||||
|
||||
struct common_download_hf_plan {
|
||||
hf_cache::hf_file primary;
|
||||
hf_cache::hf_files model_files;
|
||||
hf_cache::hf_file mmproj;
|
||||
hf_cache::hf_file mtp;
|
||||
hf_cache::hf_file preset; // if set, only this file is downloaded
|
||||
};
|
||||
common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts);
|
||||
|
||||
@@ -1,324 +0,0 @@
|
||||
#include "json-partial.h"
|
||||
|
||||
#include "log.h"
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <string>
|
||||
#include <regex>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
enum common_json_stack_element_type {
|
||||
COMMON_JSON_STACK_ELEMENT_OBJECT,
|
||||
COMMON_JSON_STACK_ELEMENT_KEY,
|
||||
COMMON_JSON_STACK_ELEMENT_ARRAY,
|
||||
};
|
||||
|
||||
struct common_json_stack_element {
|
||||
common_json_stack_element_type type;
|
||||
std::string key;
|
||||
};
|
||||
|
||||
bool common_json_parse(
|
||||
const std::string & input,
|
||||
const std::string & healing_marker,
|
||||
common_json & out)
|
||||
{
|
||||
std::string::const_iterator it = input.begin();
|
||||
const auto end = input.end();
|
||||
return common_json_parse(it, end, healing_marker, out);
|
||||
}
|
||||
|
||||
bool common_json_parse(
|
||||
std::string::const_iterator & it,
|
||||
const std::string::const_iterator & end,
|
||||
const std::string & healing_marker,
|
||||
common_json & out)
|
||||
{
|
||||
// // https://json.nlohmann.me/features/parsing/sax_interface/
|
||||
struct json_error_locator : public nlohmann::json_sax<json> {
|
||||
std::size_t position;
|
||||
bool found_error;
|
||||
std::string last_token;
|
||||
std::string exception_message;
|
||||
std::vector<common_json_stack_element> stack;
|
||||
|
||||
json_error_locator() : position(0), found_error(false) {}
|
||||
|
||||
bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
|
||||
this->position = position - 1;
|
||||
this->found_error = true;
|
||||
this->last_token = last_token;
|
||||
this->exception_message = ex.what();
|
||||
return false;
|
||||
}
|
||||
void close_value() {
|
||||
if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
|
||||
stack.pop_back();
|
||||
}
|
||||
}
|
||||
bool null() override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool boolean(bool) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool number_integer(number_integer_t) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool number_unsigned(number_unsigned_t) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool number_float(number_float_t, const string_t &) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool string(string_t &) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool binary(binary_t &) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool start_object(std::size_t) override { // NOLINT
|
||||
stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
|
||||
return true;
|
||||
}
|
||||
bool end_object() override {
|
||||
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
|
||||
stack.pop_back();
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool key(string_t & key) override { // NOLINT
|
||||
stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
|
||||
return true;
|
||||
}
|
||||
bool start_array(std::size_t) override { // NOLINT
|
||||
stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
|
||||
return true;
|
||||
}
|
||||
bool end_array() override {
|
||||
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
|
||||
stack.pop_back();
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
};
|
||||
json_error_locator err_loc;
|
||||
auto start = it;
|
||||
json::sax_parse(it, end, &err_loc);
|
||||
|
||||
if (err_loc.found_error) {
|
||||
it = start;
|
||||
auto temptative_end = it + err_loc.position;
|
||||
// LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
|
||||
|
||||
auto input = std::string(it, temptative_end);
|
||||
try {
|
||||
out.json = json::parse(input);
|
||||
// out.json = json::parse(it, temptative_end);
|
||||
it = temptative_end;
|
||||
return true;
|
||||
} catch (const std::exception & ex) {
|
||||
// No, needs healing.
|
||||
LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
|
||||
}
|
||||
auto can_parse = [](const std::string & str) {
|
||||
try {
|
||||
auto _ = json::parse(str); // NOLINT
|
||||
return true;
|
||||
} catch (const std::exception &) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
if (!healing_marker.empty() && !err_loc.stack.empty()) {
|
||||
std::string str(it, temptative_end);
|
||||
auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
|
||||
if (last_non_sp_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
|
||||
}
|
||||
auto last_non_sp_char = str[last_non_sp_pos];
|
||||
// Used to detect stops on a number, which may not be complete.
|
||||
auto was_maybe_number = [&]() {
|
||||
if (!str.empty() && std::isspace(str.back())) {
|
||||
return false;
|
||||
}
|
||||
return std::isdigit(last_non_sp_char) ||
|
||||
last_non_sp_char == '.' ||
|
||||
last_non_sp_char == 'e' ||
|
||||
last_non_sp_char == 'E' ||
|
||||
last_non_sp_char == '-';
|
||||
};
|
||||
|
||||
std::string closing;
|
||||
for (size_t i = err_loc.stack.size(); i > 0; i--) {
|
||||
auto & el = err_loc.stack[i - 1];
|
||||
if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
|
||||
closing += "}";
|
||||
} else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
|
||||
closing += "]";
|
||||
} else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
|
||||
throw std::runtime_error("Unexpected stack element type");
|
||||
}
|
||||
}
|
||||
|
||||
// Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
|
||||
static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
|
||||
|
||||
auto is_high_surrogate = [&](const std::string & s) {
|
||||
// Check if a partial of a high surrogate (U+D800-U+DBFF)
|
||||
return s.length() >= 4 &&
|
||||
s[0] == '\\' && s[1] == 'u' &&
|
||||
std::tolower(s[2]) == 'd' &&
|
||||
(s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
|
||||
};
|
||||
|
||||
// Initialize the unicode marker to a low surrogate to handle the edge case
|
||||
// where a high surrogate (U+D800-U+DBFF) is immediately followed by a
|
||||
// backslash (\)
|
||||
std::string unicode_marker_padding = "udc00";
|
||||
std::smatch last_unicode_seq;
|
||||
|
||||
if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
|
||||
std::smatch second_last_seq;
|
||||
std::string prelude = str.substr(0, last_unicode_seq.position());
|
||||
|
||||
// Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
|
||||
unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
|
||||
|
||||
if (is_high_surrogate(last_unicode_seq.str())) {
|
||||
// If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF)
|
||||
unicode_marker_padding += "\\udc00";
|
||||
} else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
|
||||
if (is_high_surrogate(second_last_seq.str())) {
|
||||
// If this follows a high surrogate, pad it to be a low surrogate
|
||||
if (last_unicode_seq.length() == 2) {
|
||||
unicode_marker_padding = "dc00";
|
||||
} else if (last_unicode_seq.length() == 3) {
|
||||
unicode_marker_padding = "c00";
|
||||
} else {
|
||||
// The original unicode_marker_padding is already padded with 0s
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
|
||||
|
||||
if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
|
||||
// We're inside an object value
|
||||
if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
|
||||
// Was about to create an object value
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
} else if (can_parse(str + ": 1" + closing)) {
|
||||
str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
|
||||
} else if (last_non_sp_char == '{' && can_parse(str + closing)) {
|
||||
// Was about to create an object
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
|
||||
} else if (can_parse(str + "\"" + closing)) {
|
||||
// Was inside an object value string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
|
||||
// Was inside an object value string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
|
||||
} else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
|
||||
// Was inside an object value string after a partial unicode escape
|
||||
str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
|
||||
} else {
|
||||
// find last :
|
||||
auto last_pos = str.find_last_of(':');
|
||||
if (last_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
|
||||
}
|
||||
// Cutting back to opening : for object value
|
||||
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
}
|
||||
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
|
||||
if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
|
||||
// Was about to create an array value
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
} else if (can_parse(str + "\"" + closing)) {
|
||||
// Was inside an array value string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
|
||||
// Was inside an array value string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
|
||||
} else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
|
||||
// Was inside an array value string after a partial unicode escape
|
||||
str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
|
||||
} else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
|
||||
// Had just finished a value
|
||||
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
|
||||
} else {
|
||||
auto last_pos = str.find_last_of("[,");
|
||||
if (last_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
|
||||
}
|
||||
// Cutting back to last [ or , for array value
|
||||
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
}
|
||||
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
|
||||
if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
|
||||
(last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
|
||||
// Was about to create an object key+value
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
|
||||
} else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
|
||||
// Was about to create an object key+value
|
||||
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
|
||||
} else if (can_parse(str + "\": 1" + closing)) {
|
||||
// Was inside an object key string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
|
||||
// Was inside an object key string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
|
||||
} else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
|
||||
// Was inside an object key string after a partial unicode escape
|
||||
str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
|
||||
} else {
|
||||
auto last_pos = str.find_last_of(':');
|
||||
if (last_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
|
||||
}
|
||||
// fprintf(stderr, "Cutting back to last : for object key+value\n");
|
||||
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
|
||||
}
|
||||
// fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
|
||||
out.json = json::parse(str);
|
||||
it = temptative_end;
|
||||
return true;
|
||||
}
|
||||
// handle unclosed top-level primitive
|
||||
if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
|
||||
std::string str(it, temptative_end);
|
||||
const auto & magic_seed = out.healing_marker.marker = healing_marker;
|
||||
if (can_parse(str + "\"")) {
|
||||
// Was inside an string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
|
||||
// Was inside an string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
|
||||
} else {
|
||||
// TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
|
||||
// fprintf(stderr, "Closing: TODO\n");
|
||||
return false;
|
||||
}
|
||||
out.json = json::parse(str);
|
||||
it = temptative_end;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
out.json = json::parse(it, end);
|
||||
it = end;
|
||||
return true;
|
||||
}
|
||||
@@ -1,39 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
// TODO: use json_fwd.hpp when possible
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
// Healing marker (empty if the JSON was fully parsed / wasn't healed).
|
||||
struct common_healing_marker {
|
||||
// Raw marker.
|
||||
std::string marker;
|
||||
|
||||
// Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
|
||||
std::string json_dump_marker;
|
||||
};
|
||||
|
||||
// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
|
||||
struct common_json {
|
||||
nlohmann::ordered_json json;
|
||||
|
||||
common_healing_marker healing_marker;
|
||||
};
|
||||
|
||||
// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
|
||||
//
|
||||
// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
|
||||
// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
|
||||
// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
|
||||
//
|
||||
// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
|
||||
bool common_json_parse(
|
||||
const std::string & input,
|
||||
const std::string & healing_marker,
|
||||
common_json & out);
|
||||
|
||||
// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
|
||||
bool common_json_parse(
|
||||
std::string::const_iterator & it,
|
||||
const std::string::const_iterator & end,
|
||||
const std::string & healing_marker,
|
||||
common_json & out);
|
||||
@@ -46,6 +46,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"DbrxForCausalLM": "dbrx",
|
||||
"DeciLMForCausalLM": "deci",
|
||||
"DeepseekForCausalLM": "deepseek",
|
||||
"DeepseekOCRForCausalLM": "deepseek",
|
||||
"DeepseekV2ForCausalLM": "deepseek",
|
||||
"DeepseekV3ForCausalLM": "deepseek",
|
||||
"DeepseekV32ForCausalLM": "deepseek",
|
||||
@@ -135,6 +136,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"LlamaModel": "llama",
|
||||
"Eagle3DraftModel": "llama",
|
||||
"Eagle3Speculator": "llama",
|
||||
"Eagle3LlamaForCausalLM": "llama",
|
||||
"LlamaForCausalLMEagle3": "llama",
|
||||
"LlavaForConditionalGeneration": "llama",
|
||||
"LlavaStableLMEpochForCausalLM": "stablelm",
|
||||
@@ -233,6 +235,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"UMT5ForConditionalGeneration": "t5",
|
||||
"UMT5Model": "t5",
|
||||
"UltravoxModel": "ultravox",
|
||||
"UnlimitedOCRForCausalLM": "deepseek",
|
||||
"VLlama3ForCausalLM": "llama",
|
||||
"VoxtralForConditionalGeneration": "llama",
|
||||
"WavTokenizerDec": "wavtokenizer",
|
||||
@@ -299,6 +302,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
|
||||
"StepVLForConditionalGeneration": "step3",
|
||||
"Step3p7ForConditionalGeneration": "step3",
|
||||
"UltravoxModel": "ultravox",
|
||||
"UnlimitedOCRForCausalLM": "deepseek",
|
||||
"VoxtralForConditionalGeneration": "ultravox",
|
||||
"YoutuVLForConditionalGeneration": "youtuvl",
|
||||
}
|
||||
|
||||
+10
-2
@@ -14,7 +14,7 @@ from .base import MmprojModel, ModelBase, TextModel, gguf, logger
|
||||
from .qwen import QwenModel
|
||||
|
||||
|
||||
@ModelBase.register("DeepseekOCRForCausalLM")
|
||||
@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM")
|
||||
class DeepseekOCRVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@@ -205,6 +205,8 @@ class DeepseekModel(TextModel):
|
||||
@ModelBase.register(
|
||||
"DeepseekV2ForCausalLM",
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekOCRForCausalLM",
|
||||
"UnlimitedOCRForCausalLM",
|
||||
"KimiVLForConditionalGeneration",
|
||||
"KimiK25ForConditionalGeneration",
|
||||
"YoutuForCausalLM",
|
||||
@@ -224,7 +226,7 @@ class DeepseekV2Model(TextModel):
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
# special handling for Deepseek OCR
|
||||
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
|
||||
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"):
|
||||
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
|
||||
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
|
||||
self.gguf_writer.add_architecture()
|
||||
@@ -350,6 +352,12 @@ class DeepseekV2Model(TextModel):
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||
|
||||
# Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA)
|
||||
if is_ocr:
|
||||
sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window")
|
||||
if sliding_window:
|
||||
self.gguf_writer.add_sliding_window(sliding_window)
|
||||
|
||||
if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
|
||||
# [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||
# note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
|
||||
|
||||
@@ -23,6 +23,7 @@ from .base import ModelBase, TextModel, gguf, logger
|
||||
"LlavaForConditionalGeneration",
|
||||
"VoxtralForConditionalGeneration",
|
||||
"LlamaForCausalLMEagle3",
|
||||
"Eagle3LlamaForCausalLM",
|
||||
"Eagle3Speculator",
|
||||
"Eagle3DraftModel",
|
||||
"IQuestCoderForCausalLM",
|
||||
|
||||
@@ -413,6 +413,15 @@ In two device selection modes, the default SYCL backend is level_zero, you can c
|
||||
|------------------|----------------------------------------|
|
||||
| Single device | --split-mode none --main-gpu DEVICE_ID |
|
||||
| Multiple devices | --split-mode layer (default) |
|
||||
| Multiple devices | --split-mode tensor (tensor parallelism) |
|
||||
|
||||
`--split-mode tensor` (tensor parallelism) shards each layer across the selected
|
||||
GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is
|
||||
left at its default `auto`, so `--split-mode tensor` works out of the box.
|
||||
Passing `--flash-attn off` together with `--split-mode tensor` is rejected at
|
||||
context creation. The default `f16` KV cache is recommended. Tensor parallelism
|
||||
is currently optimized for 2 GPUs; other device counts fall back to a generic
|
||||
all-reduce.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -715,6 +724,15 @@ In two device selection modes, the default SYCL backend is level_zero, you can c
|
||||
|------------------|----------------------------------------|
|
||||
| Single device | --split-mode none --main-gpu DEVICE_ID |
|
||||
| Multiple devices | --split-mode layer (default) |
|
||||
| Multiple devices | --split-mode tensor (tensor parallelism) |
|
||||
|
||||
`--split-mode tensor` (tensor parallelism) shards each layer across the selected
|
||||
GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is
|
||||
left at its default `auto`, so `--split-mode tensor` works out of the box.
|
||||
Passing `--flash-attn off` together with `--split-mode tensor` is rejected at
|
||||
context creation. The default `f16` KV cache is recommended. Tensor parallelism
|
||||
is currently optimized for 2 GPUs; other device counts fall back to a generic
|
||||
all-reduce.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -24,7 +24,6 @@
|
||||
"GGML_LLAMAFILE": "OFF",
|
||||
"GGML_OPENCL": "ON",
|
||||
"GGML_HEXAGON": "ON",
|
||||
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
|
||||
"LLAMA_OPENSSL": "OFF"
|
||||
}
|
||||
},
|
||||
@@ -47,7 +46,6 @@
|
||||
"GGML_LLAMAFILE": "OFF",
|
||||
"GGML_OPENCL": "ON",
|
||||
"GGML_HEXAGON": "ON",
|
||||
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
|
||||
"LLAMA_OPENSSL": "OFF"
|
||||
}
|
||||
},
|
||||
@@ -73,7 +71,6 @@
|
||||
"GGML_LLAMAFILE": "OFF",
|
||||
"GGML_OPENCL": "OFF",
|
||||
"GGML_HEXAGON": "ON",
|
||||
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
|
||||
"LLAMA_OPENSSL": "OFF"
|
||||
}
|
||||
},
|
||||
|
||||
+41
-1
@@ -13,6 +13,45 @@ The `llama-server` application supports several implementations of speculative d
|
||||
A much smaller model (called the _draft model_) generates drafts.
|
||||
A draft model is the most used approach in speculative decoding.
|
||||
|
||||
### EAGLE-3 (`draft-eagle3`)
|
||||
|
||||
EAGLE-3 uses a small draft model that reads the target model's hidden states to predict the next tokens, so it
|
||||
reaches higher acceptance than a standalone draft model of the same size. The draft is a one-layer transformer
|
||||
trained for a specific target model; it shares the target model's tokenizer and, optionally, uses a reduced draft
|
||||
vocabulary with its own `lm_head`, which is mapped back using a `d2t` table.
|
||||
|
||||
Convert the EAGLE-3 checkpoint with `--target-model-dir` so it inherits the target's tokenizer and the layer
|
||||
indices to read. Both the SpecForge `LlamaForCausalLMEagle3` and the vLLM/AngelSlim `Eagle3LlamaForCausalLM`
|
||||
checkpoint formats are supported (for example [`AngelSlim/Qwen3-4B_eagle3`](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3)
|
||||
for `Qwen/Qwen3-4B`):
|
||||
|
||||
```bash
|
||||
python convert_hf_to_gguf.py AngelSlim/Qwen3-4B_eagle3 \
|
||||
--target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-eagle3.gguf
|
||||
|
||||
llama-server -m Qwen3-4B.gguf -md Qwen3-4B-eagle3.gguf --spec-type draft-eagle3
|
||||
```
|
||||
|
||||
Supported EAGLE-3 draft models include:
|
||||
|
||||
- [yuhuili/EAGLE3-LLaMA3.1-Instruct-8B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.1-Instruct-8B)
|
||||
- [yuhuili/EAGLE3-LLaMA3.3-Instruct-70B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.3-Instruct-70B)
|
||||
- [RedHatAI/gemma-4-31B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-31B-it-speculator.eagle3)
|
||||
- [RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3)
|
||||
- [Tengyunw/qwen3_8b_eagle3](https://huggingface.co/Tengyunw/qwen3_8b_eagle3)
|
||||
- [Tengyunw/qwen3_30b_moe_eagle3](https://huggingface.co/Tengyunw/qwen3_30b_moe_eagle3)
|
||||
- [AngelSlim/Qwen3-1.7B_eagle3](https://huggingface.co/AngelSlim/Qwen3-1.7B_eagle3)
|
||||
- [AngelSlim/Qwen3-4B_eagle3](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3)
|
||||
- [AngelSlim/Qwen3-8B_eagle3](https://huggingface.co/AngelSlim/Qwen3-8B_eagle3)
|
||||
- [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3)
|
||||
- [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3)
|
||||
- [AngelSlim/Qwen3-a3B_eagle3](https://huggingface.co/AngelSlim/Qwen3-a3B_eagle3)
|
||||
- [RedHatAI/gpt-oss-20b-speculator.eagle3](https://huggingface.co/RedHatAI/gpt-oss-20b-speculator.eagle3)
|
||||
- [lmsys/EAGLE3-gpt-oss-120b-bf16](https://huggingface.co/lmsys/EAGLE3-gpt-oss-120b-bf16)
|
||||
- [nvidia/gpt-oss-120b-Eagle3-long-context](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-long-context)
|
||||
|
||||
For the full and up-to-date list of supported models, see #18039.
|
||||
|
||||
### n-gram Cache (`ngram-cache`)
|
||||
|
||||
An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences.
|
||||
@@ -108,7 +147,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
|
||||
### General Speculative Parameters
|
||||
|
||||
```
|
||||
--spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
--spec-type [none|draft-simple|draft-eagle3|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
comma-separated list of types of speculative decoding to use
|
||||
(default: none)
|
||||
(env: LLAMA_ARG_SPEC_TYPE)
|
||||
@@ -247,6 +286,7 @@ Specifies a comma-separated list of speculative decoding types to use.
|
||||
|------|-------------|
|
||||
| `none` | No speculative decoding (default) |
|
||||
| `draft-simple` | Use a simple draft model for speculation |
|
||||
| `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states |
|
||||
| `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model |
|
||||
| `ngram-cache` | Use n-gram cache lookup |
|
||||
| `ngram-simple` | Use simple n-gram pattern matching |
|
||||
|
||||
@@ -266,7 +266,6 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
|
||||
"ggml: OpenCL API version to target")
|
||||
|
||||
option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
|
||||
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")
|
||||
|
||||
# toolchain for vulkan-shaders-gen
|
||||
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
|
||||
|
||||
@@ -27,6 +27,14 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int de
|
||||
// split tensor buffer that splits matrices by rows across multiple devices
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
||||
|
||||
// Tensor parallelism (--split-mode tensor): comm_init/free/allreduce_tensor
|
||||
// trio queried by the meta-backend via ggml_backend_reg_get_proc_address.
|
||||
// See typedefs in ggml/include/ggml-backend.h. Mirrors the CUDA backend's
|
||||
// pattern (ggml_backend_cuda_comm_*).
|
||||
GGML_BACKEND_API void * ggml_backend_sycl_comm_init(ggml_backend_t * backends, size_t n_backends);
|
||||
GGML_BACKEND_API void ggml_backend_sycl_comm_free(void * comm_ctx);
|
||||
GGML_BACKEND_API bool ggml_backend_sycl_comm_allreduce_tensor(void * comm_ctx, struct ggml_tensor ** tensors);
|
||||
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||
|
||||
|
||||
@@ -34,26 +34,26 @@ template <float (*bin_op)(const float, const float),
|
||||
static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const src1_t * src1,
|
||||
dst_t * dst,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const uint32_t ne0,
|
||||
const uint32_t ne1,
|
||||
const uint32_t ne2,
|
||||
const uint3 ne3,
|
||||
const uint3 ne10,
|
||||
const uint3 ne11,
|
||||
const uint3 ne12,
|
||||
const uint3 ne13,
|
||||
/*const int s0,*/
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int s00,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s10,
|
||||
const int s11,
|
||||
const int s12,
|
||||
const int s13,
|
||||
/*const uint32_t s0,*/
|
||||
const uint32_t s1,
|
||||
const uint32_t s2,
|
||||
const uint32_t s3,
|
||||
const uint32_t s00,
|
||||
const uint32_t s01,
|
||||
const uint32_t s02,
|
||||
const uint32_t s03,
|
||||
const uint32_t s10,
|
||||
const uint32_t s11,
|
||||
const uint32_t s12,
|
||||
const uint32_t s13,
|
||||
src1_ptrs... src1s) {
|
||||
ggml_cuda_pdl_lc();
|
||||
const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
@@ -61,7 +61,7 @@ static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
|
||||
const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
|
||||
|
||||
if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
|
||||
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -69,25 +69,32 @@ static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const uint32_t i12 = fastmodulo(i2, ne12);
|
||||
const uint32_t i13 = fastmodulo(i3, ne13);
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
|
||||
const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
|
||||
const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1;
|
||||
|
||||
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const uint32_t s0 = blockDim.x * gridDim.x;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
|
||||
for (uint32_t i0 = i0s; i0 < ne0; i0 += s0) {
|
||||
const uint32_t i10 = fastmodulo(i0, ne10);
|
||||
|
||||
float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
|
||||
float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
|
||||
if constexpr (sizeof...(src1_ptrs) > 0) {
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
|
||||
} else {
|
||||
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
|
||||
result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
|
||||
}
|
||||
|
||||
dst_row[i0] = (dst_t) result;
|
||||
|
||||
// protect i0 from overflow
|
||||
if (ne0 - i0 <= s0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -110,19 +117,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,
|
||||
const uint3 ne12,
|
||||
const uint3 ne13,
|
||||
/*const int s0,*/
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int s00,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s10,
|
||||
const int s11,
|
||||
const int s12,
|
||||
const int s13,
|
||||
const uint32_t s1,
|
||||
const uint32_t s2,
|
||||
const uint32_t s3,
|
||||
const uint32_t s00,
|
||||
const uint32_t s01,
|
||||
const uint32_t s02,
|
||||
const uint32_t s03,
|
||||
const uint32_t s10,
|
||||
const uint32_t s11,
|
||||
const uint32_t s12,
|
||||
const uint32_t s13,
|
||||
src1_ptrs... src1s) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const uint32_t i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
const uint32_t i3 = fastdiv(i, prod_012);
|
||||
const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01);
|
||||
@@ -133,25 +140,25 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = fastmodulo(i1, ne11);
|
||||
const int i12 = fastmodulo(i2, ne12);
|
||||
const int i13 = fastmodulo(i3, ne13);
|
||||
const uint32_t i11 = fastmodulo(i1, ne11);
|
||||
const uint32_t i12 = fastmodulo(i2, ne12);
|
||||
const uint32_t i13 = fastmodulo(i3, ne13);
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
|
||||
const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
|
||||
const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1;
|
||||
|
||||
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const int i10 = fastmodulo(i0, ne10);
|
||||
const uint32_t i10 = fastmodulo(i0, ne10);
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
|
||||
float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
|
||||
if constexpr (sizeof...(src1_ptrs) > 0) {
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
|
||||
} else {
|
||||
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
|
||||
result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
|
||||
}
|
||||
|
||||
dst_row[i0] = (dst_t) result;
|
||||
@@ -248,6 +255,31 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
size_t s02 = nb02 / sizeof(src0_t);
|
||||
size_t s03 = nb03 / sizeof(src0_t);
|
||||
|
||||
GGML_ASSERT(ne0 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne1 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne2 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne3 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
//GGML_ASSERT(s0 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s1 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s2 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s3 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(s00 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s01 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s02 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s03 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(s10 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s11 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s12 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s13 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(cne1[0] <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(cne1[1] <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(cne1[2] <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(cne1[3] <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
|
||||
@@ -263,6 +295,8 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
|
||||
GGML_ASSERT(ne2 * ne3 <= std::numeric_limits<unsigned int>::max());
|
||||
|
||||
const int block_size = 128;
|
||||
|
||||
int64_t hne0 = std::max(ne0 / 2LL, 1LL);
|
||||
@@ -281,7 +315,13 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
|
||||
|
||||
if (block_nums.z > 65535 || block_nums.y > 65535) {
|
||||
int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
|
||||
int64_t block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
|
||||
|
||||
GGML_ASSERT(block_num <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(block_num * block_size <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne0 * ne1 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne0 * ne1 * ne2 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
|
||||
const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1));
|
||||
const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0);
|
||||
@@ -298,6 +338,10 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(int64_t(block_nums.x) * block_dims.x <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(int64_t(block_nums.y) * block_dims.y <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(int64_t(block_nums.z) * block_dims.z <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
|
||||
{
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
|
||||
+35
-29
@@ -53,10 +53,10 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
|
||||
const int64_t nmat = ne / (ne00 * ne01);
|
||||
const int64_t n = ne00 * ne01;
|
||||
|
||||
const int x = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x;
|
||||
const int y = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
|
||||
const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset
|
||||
const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
|
||||
const int64_t x = (int64_t) blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x;
|
||||
const int64_t y = (int64_t) blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
|
||||
const int64_t tx = (int64_t) blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset
|
||||
const int64_t ty = (int64_t) blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
|
||||
|
||||
__shared__ float tile[2][CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
|
||||
int cur_tile_buf = 0;
|
||||
@@ -197,7 +197,7 @@ static void ggml_cpy_scalar_contiguous_cuda(
|
||||
cudaStream_t stream) {
|
||||
|
||||
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
|
||||
ggml_cuda_kernel_launch(cpy_scalar_contiguous<src_t, dst_t>, launch_params, cx, cdst, ne);
|
||||
}
|
||||
@@ -208,6 +208,14 @@ static void ggml_cpy_scalar_cuda(
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
|
||||
|
||||
const auto launch_scalar_generic = [&]() {
|
||||
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
|
||||
ggml_cuda_kernel_launch(cpy_scalar<cpy_1_scalar<src_t, dst_t>>, launch_params,
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
};
|
||||
|
||||
if (transposed) {
|
||||
GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed
|
||||
int64_t ne00n, ne01n, ne02n;
|
||||
@@ -224,20 +232,18 @@ static void ggml_cpy_scalar_cuda(
|
||||
int64_t grid_x = (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
|
||||
int64_t grid_y = (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
|
||||
int64_t grid_z = (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM;
|
||||
GGML_ASSERT(grid_x < UINT_MAX);
|
||||
GGML_ASSERT(grid_y < USHRT_MAX);
|
||||
GGML_ASSERT(grid_z < USHRT_MAX);
|
||||
dim3 dimGrid(grid_x, grid_y, grid_z);
|
||||
dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream);
|
||||
ggml_cuda_kernel_launch(cpy_scalar_transpose<dst_t>, launch_params,
|
||||
cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
GGML_ASSERT(grid_x <= INT_MAX);
|
||||
if (grid_y > USHRT_MAX || grid_z > USHRT_MAX) {
|
||||
launch_scalar_generic();
|
||||
} else {
|
||||
dim3 dimGrid(grid_x, grid_y, grid_z);
|
||||
dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream);
|
||||
ggml_cuda_kernel_launch(cpy_scalar_transpose<dst_t>, launch_params,
|
||||
cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
} else {
|
||||
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
|
||||
ggml_cuda_kernel_launch(cpy_scalar<cpy_1_scalar<src_t, dst_t>>, launch_params,
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
launch_scalar_generic();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -248,7 +254,7 @@ static void ggml_cpy_f32_q8_0_cuda(
|
||||
|
||||
GGML_ASSERT(ne % QK8_0 == 0);
|
||||
const int64_t num_blocks = ne / QK8_0;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
@@ -259,7 +265,7 @@ static void ggml_cpy_q8_0_f32_cuda(
|
||||
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
|
||||
|
||||
const int64_t num_blocks = ne;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
@@ -271,7 +277,7 @@ static void ggml_cpy_f32_q4_0_cuda(
|
||||
|
||||
GGML_ASSERT(ne % QK4_0 == 0);
|
||||
const int64_t num_blocks = ne / QK4_0;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
@@ -284,7 +290,7 @@ static void ggml_cpy_q4_0_f32_cuda(
|
||||
const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
|
||||
cudaStream_t stream) {
|
||||
const int64_t num_blocks = ne;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
@@ -297,7 +303,7 @@ static void ggml_cpy_f32_q4_1_cuda(
|
||||
|
||||
GGML_ASSERT(ne % QK4_1 == 0);
|
||||
const int64_t num_blocks = ne / QK4_1;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
@@ -310,7 +316,7 @@ static void ggml_cpy_q4_1_f32_cuda(
|
||||
const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
|
||||
cudaStream_t stream) {
|
||||
const int64_t num_blocks = ne;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
@@ -323,7 +329,7 @@ static void ggml_cpy_f32_q5_0_cuda(
|
||||
|
||||
GGML_ASSERT(ne % QK5_0 == 0);
|
||||
const int64_t num_blocks = ne / QK5_0;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
@@ -336,7 +342,7 @@ static void ggml_cpy_q5_0_f32_cuda(
|
||||
const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
|
||||
cudaStream_t stream) {
|
||||
const int64_t num_blocks = ne;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
@@ -349,7 +355,7 @@ static void ggml_cpy_f32_q5_1_cuda(
|
||||
|
||||
GGML_ASSERT(ne % QK5_1 == 0);
|
||||
const int64_t num_blocks = ne / QK5_1;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
@@ -362,7 +368,7 @@ static void ggml_cpy_q5_1_f32_cuda(
|
||||
const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
|
||||
cudaStream_t stream) {
|
||||
const int64_t num_blocks = ne;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
@@ -375,7 +381,7 @@ static void ggml_cpy_f32_iq4_nl_cuda(
|
||||
|
||||
GGML_ASSERT(ne % QK4_NL == 0);
|
||||
const int64_t num_blocks = ne / QK4_NL;
|
||||
GGML_ASSERT(num_blocks < UINT_MAX);
|
||||
GGML_ASSERT(num_blocks <= INT_MAX);
|
||||
cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
@@ -25,7 +25,6 @@ include(ExternalProject)
|
||||
option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF)
|
||||
option(GGML_HEXAGON_FA_EXP2_HF "ggml-hexagon: use FP16 exp2 polynomial in FA softmax instead of F32 exp round-trip" OFF)
|
||||
set(GGML_HEXAGON_HTP_CERT "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate")
|
||||
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)")
|
||||
|
||||
add_library(htp_iface OBJECT
|
||||
${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c)
|
||||
@@ -72,15 +71,12 @@ function(build_htp_skel V)
|
||||
-DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT}
|
||||
-DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT}
|
||||
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
|
||||
-DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}
|
||||
-DDSP_VERSION=${V}
|
||||
-DPREBUILT_LIB_DIR="toolv19_${V}")
|
||||
list(APPEND HTP_SKELS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so)
|
||||
set(HTP_SKELS ${HTP_SKELS} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
build_htp_skel(v68)
|
||||
build_htp_skel(v69)
|
||||
build_htp_skel(v73)
|
||||
build_htp_skel(v75)
|
||||
build_htp_skel(v79)
|
||||
|
||||
+1359
-1274
File diff suppressed because it is too large
Load Diff
@@ -5,10 +5,12 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <stdio.h>
|
||||
#include "htp-ops.h"
|
||||
#include "htp/matmul-ops.h"
|
||||
|
||||
struct htp_opnode {
|
||||
ggml_tensor * node = nullptr;
|
||||
@@ -17,6 +19,13 @@ struct htp_opnode {
|
||||
|
||||
htp_op_code opcode = HTP_OP_INVALID;
|
||||
|
||||
std::vector<ggml_tensor *> extra_dsts;
|
||||
|
||||
int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS] = {0};
|
||||
|
||||
htp_opnode(ggml_tensor * node = nullptr, std::vector<ggml_tensor *> fused = {}, htp_op_code opcode = HTP_OP_INVALID, std::vector<ggml_tensor *> extra_dsts = {})
|
||||
: node(node), fused(std::move(fused)), opcode(opcode), extra_dsts(std::move(extra_dsts)) {}
|
||||
|
||||
ggml_op op() const {
|
||||
return node->op;
|
||||
}
|
||||
@@ -25,6 +34,26 @@ struct htp_opnode {
|
||||
return fused.empty() ? node : fused.back();
|
||||
}
|
||||
|
||||
void add_fused(ggml_tensor * t, bool extra_dst = false) {
|
||||
fused.push_back(t);
|
||||
if (extra_dst) {
|
||||
extra_dsts.push_back(t);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<const ggml_tensor *> get_outputs() const {
|
||||
std::vector<const ggml_tensor *> res;
|
||||
if (extra_dsts.empty()) {
|
||||
res.push_back(dst());
|
||||
} else {
|
||||
res.push_back(node);
|
||||
for (const auto * x : extra_dsts) {
|
||||
res.push_back(x);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
const ggml_tensor * src0() const {
|
||||
return node->src[0];
|
||||
}
|
||||
@@ -37,10 +66,6 @@ struct htp_opnode {
|
||||
return ggml_op_is_empty(node->op);
|
||||
}
|
||||
|
||||
void add_fused(ggml_tensor * t) {
|
||||
fused.push_back(t);
|
||||
}
|
||||
|
||||
bool stackable() const {
|
||||
switch (this->op()) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
@@ -131,87 +156,117 @@ struct htp_opformat {
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
char kparams[128];
|
||||
|
||||
int format_tensor_dims(char * str, const struct ggml_tensor * t) {
|
||||
int format_tensor_dims(char * str, size_t max_size, const struct ggml_tensor * t) {
|
||||
if (!t) {
|
||||
return sprintf(str, "NONE");
|
||||
return snprintf(str, max_size, "NONE");
|
||||
}
|
||||
if (t->ne[2] == 1 && t->ne[3] == 1) {
|
||||
return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
|
||||
return snprintf(str, max_size, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
|
||||
} else {
|
||||
return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
|
||||
return snprintf(str, max_size, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_dims(char * str, const htp_opnode & node) {
|
||||
void format_op_dims(char * str, size_t max_size, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
char * p_end = str + max_size;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += format_tensor_dims(p, inputs[0]);
|
||||
p += std::min((size_t)format_tensor_dims(p, p_end - p, inputs[0]), (size_t)(p_end - p));
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += format_tensor_dims(p, inputs[i]);
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
|
||||
}
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)format_tensor_dims(p, p_end - p, inputs[i]), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
char self[64];
|
||||
format_tensor_dims(self, node.dst());
|
||||
p += sprintf(p, "%s", self);
|
||||
format_tensor_dims(self, sizeof(self), node.dst());
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", self), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
int format_tensor_strides(char * str, const struct ggml_tensor * t) {
|
||||
int format_tensor_strides(char * str, size_t max_size, const struct ggml_tensor * t) {
|
||||
if (!t) {
|
||||
return sprintf(str, "NONE");
|
||||
return snprintf(str, max_size, "NONE");
|
||||
}
|
||||
const char * c = ggml_is_contiguous(t) ? "" : "!";
|
||||
|
||||
if (t->ne[2] == 1 && t->ne[3] == 1) {
|
||||
return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
|
||||
return snprintf(str, max_size, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
|
||||
} else {
|
||||
return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
|
||||
return snprintf(str, max_size, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_strides(char * str, const htp_opnode & node) {
|
||||
void format_op_strides(char * str, size_t max_size, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
char * p_end = str + max_size;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += format_tensor_strides(p, inputs[0]);
|
||||
p += std::min((size_t)format_tensor_strides(p, p_end - p, inputs[0]), (size_t)(p_end - p));
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += format_tensor_strides(p, inputs[i]);
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
|
||||
}
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)format_tensor_strides(p, p_end - p, inputs[i]), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
char self[64];
|
||||
format_tensor_strides(self, node.dst());
|
||||
p += sprintf(p, "%s", self);
|
||||
format_tensor_strides(self, sizeof(self), node.dst());
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", self), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_types(char * str, const htp_opnode & node) {
|
||||
void format_op_types(char * str, size_t max_size, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
char * p_end = str + max_size;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += sprintf(p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE");
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE");
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE"), (size_t)(p_end - p));
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
|
||||
}
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE"), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", ggml_type_name(node.dst()->type));
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", ggml_type_name(node.dst()->type)), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
const char * tensor_buff_name(const struct ggml_tensor * t) {
|
||||
@@ -221,51 +276,102 @@ struct htp_opformat {
|
||||
return "NONE";
|
||||
}
|
||||
|
||||
void format_op_buffs(char * str, const htp_opnode & node) {
|
||||
void format_op_buffs(char * str, size_t max_size, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
char * p_end = str + max_size;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += sprintf(p, "%s", tensor_buff_name(inputs[0]));
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", tensor_buff_name(inputs[i]));
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(inputs[0])), (size_t)(p_end - p));
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
|
||||
}
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(inputs[i])), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", tensor_buff_name(node.dst()));
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(node.dst())), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_names(char * str, const htp_opnode & node) {
|
||||
void format_op_names(char * str, size_t max_size, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
char * p_end = str + max_size;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += sprintf(p, "%s", inputs[0] ? inputs[0]->name : "NONE");
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", inputs[i] ? inputs[i]->name : "NONE");
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[0] ? inputs[0]->name : "NONE"), (size_t)(p_end - p));
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
|
||||
}
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[i] ? inputs[i]->name : "NONE"), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", node.dst()->name);
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", node.dst()->name), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
void format_kernel_params(char * str, size_t max_size, const htp_opnode & node) {
|
||||
if (node.opcode == HTP_OP_MUL_MAT || node.opcode == HTP_OP_MUL_MAT_ID ||
|
||||
node.opcode == HTP_OP_MUL_MAT_QKV || node.opcode == HTP_OP_MUL_MAT_FFN) {
|
||||
const auto * kparams = (const struct htp_mm_kernel_params *) node.kernel_params;
|
||||
const char * path = "unknown";
|
||||
int32_t type = kparams->kernel_type;
|
||||
if (type == HTP_MM_KERNEL_HMX_2D || type == HTP_MM_KERNEL_HMX_F16_BATCHED) {
|
||||
path = "hmx-tiled";
|
||||
} else if (type == HTP_MM_KERNEL_HVX_F16_F16_VTCM || type == HTP_MM_KERNEL_HVX_F32_F32_VTCM ||
|
||||
type == HTP_MM_KERNEL_HVX_QUANT_ROW || type == HTP_MM_KERNEL_HVX_QUANT_BLOCK) {
|
||||
path = "hvx-tiled";
|
||||
} else if (type == HTP_MM_KERNEL_HVX_F16_F16_DDR || type == HTP_MM_KERNEL_HVX_F16_F32_DDR ||
|
||||
type == HTP_MM_KERNEL_HVX_F32_F32_DDR || type == HTP_MM_KERNEL_HVX_F32_F16_DDR ||
|
||||
type == HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT) {
|
||||
path = "hvx-flat";
|
||||
}
|
||||
snprintf(str, max_size, "%s vtcm %d", path, (int) kparams->vtcm_size);
|
||||
} else {
|
||||
snprintf(str, max_size, "----");
|
||||
}
|
||||
}
|
||||
|
||||
void format(const htp_opnode & node) {
|
||||
format_op_dims(dims, node);
|
||||
format_op_strides(strides, node);
|
||||
format_op_types(types, node);
|
||||
format_op_buffs(buffs, node);
|
||||
format_op_names(names, node);
|
||||
format_op_dims(dims, sizeof(dims), node);
|
||||
format_op_strides(strides, sizeof(strides), node);
|
||||
format_op_types(types, sizeof(types), node);
|
||||
format_op_buffs(buffs, sizeof(buffs), node);
|
||||
format_op_names(names, sizeof(names), node);
|
||||
format_kernel_params(kparams, sizeof(kparams), node);
|
||||
}
|
||||
|
||||
htp_opformat() {}
|
||||
htp_opformat() {
|
||||
strides[0] = '\0';
|
||||
dims[0] = '\0';
|
||||
types[0] = '\0';
|
||||
buffs[0] = '\0';
|
||||
names[0] = '\0';
|
||||
kparams[0] = '\0';
|
||||
}
|
||||
htp_opformat(const htp_opnode & node) { format(node); }
|
||||
};
|
||||
|
||||
|
||||
@@ -19,43 +19,9 @@ add_library(${HTP_LIB} SHARED
|
||||
htp_iface_skel.c
|
||||
worker-pool.c
|
||||
hex-dma.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
|
||||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>
|
||||
FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
|
||||
|
||||
if (GGML_HEXAGON_FA_EXP2_HF)
|
||||
message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
|
||||
endif()
|
||||
|
||||
# HMX acceleration: available on v73+ architectures
|
||||
set(HTP_HMX_VERSIONS v73 v75 v79 v81)
|
||||
list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
|
||||
|
||||
if (_hmx_idx GREATER_EQUAL 0)
|
||||
target_sources(${HTP_LIB} PRIVATE
|
||||
hmx-flash-attn-ops.c
|
||||
hmx-matmul-ops.c
|
||||
hmx-queue.c
|
||||
)
|
||||
|
||||
# -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
|
||||
set_source_files_properties(
|
||||
hmx-flash-attn-ops.c
|
||||
hmx-matmul-ops.c
|
||||
hmx-queue.c
|
||||
PROPERTIES COMPILE_OPTIONS "-mhmx"
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE HTP_HAS_HMX=1)
|
||||
endif()
|
||||
|
||||
build_idl(htp_iface.idl ${HTP_LIB})
|
||||
|
||||
target_sources(${HTP_LIB} PRIVATE
|
||||
hmx-queue.c
|
||||
flash-attn-ops.c
|
||||
hmx-flash-attn-ops.c
|
||||
matmul-ops.c
|
||||
binary-ops.c
|
||||
unary-ops.c
|
||||
@@ -63,7 +29,6 @@ target_sources(${HTP_LIB} PRIVATE
|
||||
softmax-ops.c
|
||||
act-ops.c
|
||||
rope-ops.c
|
||||
flash-attn-ops.c
|
||||
set-rows-ops.c
|
||||
get-rows-ops.c
|
||||
cpy-ops.c
|
||||
@@ -79,6 +44,17 @@ target_sources(${HTP_LIB} PRIVATE
|
||||
pad-ops.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
|
||||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>)
|
||||
|
||||
if (GGML_HEXAGON_FA_EXP2_HF)
|
||||
message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
|
||||
endif()
|
||||
|
||||
build_idl(htp_iface.idl ${HTP_LIB})
|
||||
|
||||
set_target_properties(${HTP_LIB} PROPERTIES EXPORT_COMPILE_COMMANDS ON)
|
||||
|
||||
install(TARGETS ${HTP_LIB})
|
||||
|
||||
@@ -3,7 +3,7 @@ if (HEXAGON_TOOLCHAIN_INCLUDED)
|
||||
endif()
|
||||
set(HEXAGON_TOOLCHAIN_INCLUDED true)
|
||||
|
||||
#Cross Compiling for Hexagon
|
||||
# Cross Compiling for Hexagon
|
||||
set(HEXAGON TRUE)
|
||||
set(CMAKE_SYSTEM_NAME QURT)
|
||||
set(CMAKE_SYSTEM_PROCESSOR Hexagon)
|
||||
@@ -14,7 +14,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
|
||||
set(CUSTOM_RUNELF_PATH "")
|
||||
|
||||
#To fix backward compatibility with EAI addon.
|
||||
if (NOT HEXAGON_SDK_ROOT)
|
||||
set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
|
||||
endif()
|
||||
@@ -31,7 +30,6 @@ endif()
|
||||
file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
|
||||
file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT)
|
||||
|
||||
#Get the Binary extension of the Hexagon Toolchain
|
||||
if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows)
|
||||
set(HEXAGON_TOOLCHAIN_SUFFIX .exe)
|
||||
endif()
|
||||
@@ -48,12 +46,12 @@ set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES
|
||||
HEXAGON_TOOLS_ROOT
|
||||
)
|
||||
|
||||
#QURT Related includes and linker flags
|
||||
# QURT Related includes and linker flags
|
||||
set(V_ARCH ${HEXAGON_ARCH})
|
||||
set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/ADSP${V_ARCH}MP${V_ARCH_EXTN}")
|
||||
set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}")
|
||||
|
||||
if( ${TREE} MATCHES PAKMAN )
|
||||
if (${TREE} MATCHES PAKMAN)
|
||||
set(_QURT_INSTALL_DIR "${QURT_IMAGE_DIR}/compute${V_ARCH}${V_ARCH_EXTN}")
|
||||
endif()
|
||||
message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}")
|
||||
@@ -83,11 +81,9 @@ set(QURT_START_LINK_LIBS
|
||||
)
|
||||
STRING(REPLACE ";" " " QURT_START_LINK_LIBS "${QURT_START_LINK_LIBS}")
|
||||
|
||||
set(QURT_END_LINK_LIBS
|
||||
${TARGET_DIR}/fini.o
|
||||
)
|
||||
set(QURT_END_LINK_LIBS ${TARGET_DIR}/fini.o)
|
||||
|
||||
#Non QURT related includes and linker flags
|
||||
# Non QURT related includes and linker flags
|
||||
|
||||
set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}")
|
||||
|
||||
@@ -99,8 +95,10 @@ if (NOT NO_WRAP_MEM_API)
|
||||
set(WRAP_MEMALIGN -Wl,--wrap=memalign)
|
||||
endif()
|
||||
|
||||
set(ARCH_FLAGS "-mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -mhmx")
|
||||
|
||||
set(PIC_SHARED_LD_FLAGS
|
||||
-mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH}
|
||||
${ARCH_FLAGS}
|
||||
-G0
|
||||
-fpic
|
||||
-Wl,-Bsymbolic
|
||||
@@ -120,13 +118,13 @@ STRING(REPLACE ";" " " PIC_SHARED_LD_FLAGS "${PIC_SHARED_LD_FLAGS}")
|
||||
|
||||
set(HEXAGON_PIC_SHARED_LINK_OPTIONS "${PIC_SHARED_LD_FLAGS}")
|
||||
|
||||
#System include paths
|
||||
# System include paths
|
||||
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs)
|
||||
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef)
|
||||
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs)
|
||||
|
||||
#LLVM toolchain setup
|
||||
#Compiler paths, options and architecture
|
||||
# LLVM toolchain setup
|
||||
# Compiler paths, options and architecture
|
||||
set(CMAKE_C_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX})
|
||||
set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX})
|
||||
set(CMAKE_AR ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX})
|
||||
@@ -137,8 +135,8 @@ set(CMAKE_PREFIX_PATH ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon)
|
||||
set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Wl,-soname,")
|
||||
set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,")
|
||||
|
||||
#Compiler Options
|
||||
set(COMMON_FLAGS "-mcpu=hexagon${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")
|
||||
# Compiler Options
|
||||
set(COMMON_FLAGS "${ARCH_FLAGS} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")
|
||||
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -D_DEBUG -g")
|
||||
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O2 -g")
|
||||
|
||||
@@ -18,7 +18,8 @@
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
#include "htp-ops.h"
|
||||
#include "hmx-ops.h"
|
||||
|
||||
int hmx_flash_attn_ext(struct htp_ops_context * octx);
|
||||
|
||||
// Must be multiple of 32
|
||||
#define FLASH_ATTN_BLOCK_SIZE (32 * 2)
|
||||
@@ -633,7 +634,6 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
// HMX path: head_dim multiple of 64, F16 KV, and no sinks
|
||||
if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 64 == 0 && v->ne[0] % 64 == 0 && octx->src[4] == NULL) {
|
||||
int ret = hmx_flash_attn_ext(octx);
|
||||
@@ -642,7 +642,6 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
}
|
||||
// VTCM too small or other failure -> fall through to HVX path
|
||||
}
|
||||
#endif
|
||||
|
||||
struct htp_fa_context factx;
|
||||
factx.octx = octx;
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
#ifndef HEX_COMMON_H
|
||||
#define HEX_COMMON_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifndef SIZE_MAX
|
||||
#define SIZE_MAX ((size_t)-1)
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
static inline uint32_t hex_ceil_pow2(uint32_t x) {
|
||||
if (x <= 1) { return 1; }
|
||||
int p = 2;
|
||||
x--;
|
||||
while (x >>= 1) { p <<= 1; }
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline size_t hmx_ceil_div(size_t num, size_t den) {
|
||||
return (num + den - 1) / den;
|
||||
}
|
||||
|
||||
static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
|
||||
return ((size_t) addr & (align - 1)) == 0;
|
||||
}
|
||||
|
||||
static inline size_t hex_align_up(size_t v, size_t align) {
|
||||
return hmx_ceil_div(v, align) * align;
|
||||
}
|
||||
|
||||
static inline size_t hex_align_down(size_t v, size_t align) {
|
||||
return (v / align) * align;
|
||||
}
|
||||
|
||||
static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
|
||||
uint32_t left_off = (size_t) addr & (chunk_size - 1);
|
||||
uint32_t right_off = left_off + n;
|
||||
return right_off <= chunk_size;
|
||||
}
|
||||
|
||||
static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
|
||||
return m * ((n + m - 1) / m);
|
||||
}
|
||||
|
||||
static inline size_t hex_smin(size_t a, size_t b) {
|
||||
return a < b ? a : b;
|
||||
}
|
||||
|
||||
static inline size_t hex_smax(size_t a, size_t b) {
|
||||
return a > b ? a : b;
|
||||
}
|
||||
|
||||
static inline void hex_swap_ptr(void ** p1, void ** p2) {
|
||||
void * t = *p1;
|
||||
*p1 = *p2;
|
||||
*p2 = t;
|
||||
}
|
||||
|
||||
static inline bool hex_mul_overflow(size_t a, size_t b, size_t *out) {
|
||||
if (a != 0 && b > SIZE_MAX / a) return true;
|
||||
*out = a * b;
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool hex_add_overflow(size_t a, size_t b, size_t *out) {
|
||||
if (a > SIZE_MAX - b) return true;
|
||||
*out = a + b;
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif // HEX_COMMON_H
|
||||
@@ -5,6 +5,7 @@
|
||||
#include <hexagon_types.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include "hex-utils.h"
|
||||
|
||||
#include "hex-profile.h"
|
||||
|
||||
@@ -127,13 +128,8 @@ static inline dma_ptr dma_make_ptr(void *dst, const void *src)
|
||||
return p;
|
||||
}
|
||||
|
||||
#if __HVX_ARCH__ < 73
|
||||
static const uint32_t dma_src_l2_bypass_on = 1;
|
||||
static const uint32_t dma_dst_l2_bypass_on = 0;
|
||||
#else
|
||||
static const uint32_t dma_src_l2_bypass_on = 1;
|
||||
static const uint32_t dma_dst_l2_bypass_on = 1;
|
||||
#endif
|
||||
|
||||
static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t size) {
|
||||
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
|
||||
|
||||
@@ -11,14 +11,7 @@
|
||||
|
||||
#include "hex-fastdiv.h"
|
||||
#include "hex-dump.h"
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
#include "hex-common.h"
|
||||
|
||||
static inline uint64_t hex_get_cycles() {
|
||||
uint64_t cycles = 0;
|
||||
@@ -32,54 +25,6 @@ static inline uint64_t hex_get_pktcnt() {
|
||||
return pktcnt;
|
||||
}
|
||||
|
||||
static inline uint32_t hex_ceil_pow2(uint32_t x) {
|
||||
if (x <= 1) { return 1; }
|
||||
int p = 2;
|
||||
x--;
|
||||
while (x >>= 1) { p <<= 1; }
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline size_t hmx_ceil_div(size_t num, size_t den) {
|
||||
return (num + den - 1) / den;
|
||||
}
|
||||
|
||||
static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
|
||||
return ((size_t) addr & (align - 1)) == 0;
|
||||
}
|
||||
|
||||
static inline size_t hex_align_up(size_t v, size_t align) {
|
||||
return hmx_ceil_div(v, align) * align;
|
||||
}
|
||||
|
||||
static inline size_t hex_align_down(size_t v, size_t align) {
|
||||
return (v / align) * align;
|
||||
}
|
||||
|
||||
static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
|
||||
uint32_t left_off = (size_t) addr & (chunk_size - 1);
|
||||
uint32_t right_off = left_off + n;
|
||||
return right_off <= chunk_size;
|
||||
}
|
||||
|
||||
static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
|
||||
return m * ((n + m - 1) / m);
|
||||
}
|
||||
|
||||
static inline size_t hex_smin(size_t a, size_t b) {
|
||||
return a < b ? a : b;
|
||||
}
|
||||
|
||||
static inline size_t hex_smax(size_t a, size_t b) {
|
||||
return a > b ? a : b;
|
||||
}
|
||||
|
||||
static inline void hex_swap_ptr(void ** p1, void ** p2) {
|
||||
void * t = *p1;
|
||||
*p1 = *p2;
|
||||
*p2 = t;
|
||||
}
|
||||
|
||||
static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) {
|
||||
const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
|
||||
Q6_l2fetch_AP((void *) p, control);
|
||||
|
||||
@@ -49,7 +49,7 @@
|
||||
// g_br = hex_align_up(gqa_factor * Br, 32) replaces Br for all Q/O/S/P/D dimensions.
|
||||
// Layout: Q + O_ping + O_pong + K_dma*2 + V_dma*2 + K_tile + V_tile + S + P + D + vectors + scales
|
||||
// Mask is DMA'd into a VTCM buffer (Br rows per KV block) to avoid DDR reads in softmax.
|
||||
static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool use_pipeline) {
|
||||
static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool pipeline) {
|
||||
const size_t g_br = hex_align_up(gqa_factor * Br, HMX_FP16_TILE_N_ROWS);
|
||||
const size_t q_tile_size = hex_align_up(g_br * DK * sizeof(__fp16), 4096); // Q: [g_br, DK]
|
||||
const size_t o_tile_size = hex_align_up(g_br * DV * sizeof(__fp16), 4096); // O: [g_br, DV] x2 ping-pong
|
||||
@@ -70,7 +70,7 @@ static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV,
|
||||
+ k_dma_size * 2 // K DMA x2
|
||||
+ v_dma_size * 2 // V DMA x2
|
||||
+ k_tile_size * 1 // K tiles
|
||||
+ v_tile_size * (use_pipeline ? 2 : 1) // V tiles (double-buffered if pipelining)
|
||||
+ v_tile_size * (pipeline ? 2 : 1) // V tiles (double-buffered if pipelining)
|
||||
+ s_tile_size * 2 // S + P
|
||||
+ d_tile_size * 1 // D (diagonal matrix)
|
||||
+ col_vec_size * 4 // m_vec, l_vec, s_rowmax, p_rowsum
|
||||
@@ -290,7 +290,7 @@ static const int16_t d_tile_scatter_offsets[64] __attribute__((aligned(128))) =
|
||||
|
||||
struct hmx_fa_context {
|
||||
const struct htp_ops_context * octx;
|
||||
bool use_pipeline; // true when n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2
|
||||
bool pipeline; // true when n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2
|
||||
uint32_t n_threads;
|
||||
|
||||
// Op parameters
|
||||
@@ -409,7 +409,7 @@ static void fa_v_interleave_thread(unsigned int n, unsigned int i, void * data)
|
||||
return;
|
||||
}
|
||||
|
||||
__fp16 * v_tiles_dest = factx->use_pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];
|
||||
__fp16 * v_tiles_dest = factx->pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];
|
||||
|
||||
struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
|
||||
htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
|
||||
@@ -1312,13 +1312,13 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
const size_t g_br = hex_align_up(G * Br, HMX_FP16_TILE_N_ROWS);
|
||||
|
||||
const uint32_t n_kv_blocks = (nek1 + Bc - 1) / Bc;
|
||||
const bool use_pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2);
|
||||
const bool pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2);
|
||||
|
||||
// Bypass thread pool dispatch for small prompts/non-pipelined prefill by setting n_threads = 1
|
||||
const uint32_t n_threads = use_pipeline ? n_threads_init : 1;
|
||||
const uint32_t n_threads = pipeline ? n_threads_init : 1;
|
||||
|
||||
FARF(HIGH, "hmx-fa: neq1=%u nek1=%u DK=%u DV=%u G=%u Br=%zu Bc=%zu g_br=%zu n_kv_blocks=%u pipeline=%d vtcm=%zu",
|
||||
neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, use_pipeline, vtcm_budget);
|
||||
neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, pipeline, vtcm_budget);
|
||||
|
||||
// ======== Build context ========
|
||||
struct hmx_fa_context factx;
|
||||
@@ -1339,7 +1339,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
factx.n_kv_blocks = n_kv_blocks;
|
||||
factx.is_q_fp32 = (q->type == HTP_TYPE_F32);
|
||||
factx.is_dst_fp32 = (dst->type == HTP_TYPE_F32);
|
||||
factx.use_pipeline = use_pipeline;
|
||||
factx.pipeline = pipeline;
|
||||
factx.mask_broadcast = (mask != NULL && mask->ne[2] == 1);
|
||||
|
||||
// Extract op parameters (mutable during softcap adjustment, then stored as const in factx)
|
||||
@@ -1405,7 +1405,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
factx.vtcm_v_fp16[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_dma_bytes);
|
||||
factx.vtcm_k_tiles = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, k_tile_bytes);
|
||||
factx.vtcm_v_tiles[0] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
|
||||
if (use_pipeline) {
|
||||
if (pipeline) {
|
||||
factx.vtcm_v_tiles[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
|
||||
} else {
|
||||
factx.vtcm_v_tiles[1] = NULL;
|
||||
@@ -1456,7 +1456,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
// ======== HMX lock strategy ========
|
||||
// Pipeline: queue thread auto-acquires HMX lock on first push; released by suspend.
|
||||
// Fallback: main thread holds the lock (original behavior).
|
||||
if (!factx.use_pipeline) {
|
||||
if (!factx.pipeline) {
|
||||
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
|
||||
}
|
||||
|
||||
@@ -1550,7 +1550,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
const size_t k_src_stride = size_k_row_padded / sizeof(__fp16);
|
||||
const size_t v_src_stride = size_v_row_padded / sizeof(__fp16);
|
||||
|
||||
if (factx.use_pipeline) {
|
||||
if (factx.pipeline) {
|
||||
// ==================================================================
|
||||
// Pipeline path: HVX phases ‖ HMX queue worker
|
||||
// ==================================================================
|
||||
@@ -1780,7 +1780,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
fa_build_d_diag_inv_l(&factx, n_row_tiles, n_row_tiles_g_br);
|
||||
|
||||
// HMX: O_final = diag(1/l) @ O_prev
|
||||
if (factx.use_pipeline) {
|
||||
if (factx.pipeline) {
|
||||
on_job.o_curr = o_tile_curr;
|
||||
on_job.o_prev = o_tile_prev;
|
||||
on_job.d_tiles = factx.vtcm_d_tiles;
|
||||
@@ -1826,7 +1826,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
} // end KV head loop
|
||||
} // end batch loop
|
||||
|
||||
if (factx.use_pipeline) {
|
||||
if (factx.pipeline) {
|
||||
hmx_queue_suspend(ctx->hmx_queue);
|
||||
} else {
|
||||
HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,6 +0,0 @@
|
||||
// HMX operations compiled as a single translation unit.
|
||||
// This allows interprocedural optimizations within HMX ops without requiring global HTP LTO.
|
||||
|
||||
#include "hmx-queue.c"
|
||||
#include "hmx-matmul-ops.c"
|
||||
#include "hmx-flash-attn-ops.c"
|
||||
@@ -1,88 +0,0 @@
|
||||
// HMX operation entry-point declarations.
|
||||
// Ported from htp-ops-lib/include/dsp/ops.h (renamed, benchmark kernels removed). (https://github.com/haozixu/htp-ops-lib)
|
||||
|
||||
#ifndef HMX_OPS_H
|
||||
#define HMX_OPS_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "htp-ops.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
float *dst;
|
||||
const float *activation;
|
||||
const __fp16 *permuted_weight;
|
||||
int m;
|
||||
int k;
|
||||
int n;
|
||||
int act_stride;
|
||||
int weight_stride;
|
||||
int dst_stride;
|
||||
int ne02;
|
||||
int ne03;
|
||||
int ne12;
|
||||
int ne13;
|
||||
size_t src0_nb2;
|
||||
size_t src0_nb3;
|
||||
size_t src1_nb2;
|
||||
size_t src1_nb3;
|
||||
size_t dst_nb2;
|
||||
size_t dst_nb3;
|
||||
} hmx_matmul_f16_f32_batched_params_t;
|
||||
|
||||
// HMX matrix multiplication — tile-permuted FP16 weights, FP32 activation/output
|
||||
// act_stride: activation row stride in elements (= k for contiguous, or
|
||||
// nb[1]/sizeof(float) for permuted tensors like attention Q).
|
||||
// weight_stride: weight row stride in elements (= k for compact weights, or
|
||||
// nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK).
|
||||
int hmx_matmul_f16_f32(struct htp_context *ctx,
|
||||
float *restrict dst,
|
||||
const float *activation,
|
||||
const __fp16 *permuted_weight,
|
||||
int m, int k, int n,
|
||||
int act_stride,
|
||||
int weight_stride);
|
||||
|
||||
// Batched F16 wrapper over hmx_mat_mul_f16_f32.
|
||||
// Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3.
|
||||
int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params);
|
||||
|
||||
// HMX matrix multiplication — all supported weight types (F16/F32/Q4_0/Q4_1/Q8_0/IQ4_NL/MXFP4)
|
||||
int hmx_matmul_2d_f32(struct htp_context *ctx,
|
||||
float *restrict dst,
|
||||
const float *activation,
|
||||
const uint8_t *permuted_weight,
|
||||
int m, int k, int n,
|
||||
int act_stride,
|
||||
int weight_stride,
|
||||
int weight_type);
|
||||
|
||||
struct mmid_row_mapping;
|
||||
|
||||
int hmx_matmul_id_2d_f32(struct htp_context *ctx,
|
||||
float *restrict dst,
|
||||
const float *activation,
|
||||
const uint8_t *permuted_weight,
|
||||
int m, int k, int n,
|
||||
int ne11,
|
||||
size_t act_nb1, size_t act_nb2,
|
||||
size_t dst_nb1, size_t dst_nb2,
|
||||
int weight_stride,
|
||||
int weight_type,
|
||||
const struct mmid_row_mapping *matrix_rows,
|
||||
int cur_a,
|
||||
int mapping_stride);
|
||||
|
||||
// HMX flash attention
|
||||
int hmx_flash_attn_ext(struct htp_ops_context * octx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // HMX_OPS_H
|
||||
@@ -13,7 +13,9 @@
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifndef HTP_MAX_NTHREADS
|
||||
#define HTP_MAX_NTHREADS 10
|
||||
#endif
|
||||
#define HTP_MAX_MMAPS 16
|
||||
|
||||
// Memory mapping
|
||||
@@ -42,9 +44,13 @@ struct htp_ops_context {
|
||||
|
||||
enum htp_op_code op; // FIXME: rename to opcode
|
||||
int32_t op_params[HTP_OP_MAX_PARAMS];
|
||||
int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS];
|
||||
|
||||
const struct htp_tensor * src[HTP_OP_MAX_INPUTS];
|
||||
const struct htp_tensor * dst;
|
||||
union {
|
||||
const struct htp_tensor * dst;
|
||||
const struct htp_tensor * dsts[HTP_OP_MAX_OUTPUTS];
|
||||
};
|
||||
|
||||
// TODO convert these to an array
|
||||
struct htp_spad src0_spad;
|
||||
@@ -87,13 +93,13 @@ struct htp_context {
|
||||
|
||||
struct htp_ops_context octx;
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
struct hmx_queue * hmx_queue; // Async HMX queue for pipeline overlap
|
||||
#endif
|
||||
};
|
||||
|
||||
int op_matmul(struct htp_ops_context * octx);
|
||||
int op_matmul_id(struct htp_ops_context * octx);
|
||||
int op_matmul_qkv(struct htp_ops_context * octx);
|
||||
int op_matmul_ffn(struct htp_ops_context * octx);
|
||||
int op_binary(struct htp_ops_context * octx);
|
||||
int op_unary(struct htp_ops_context * octx);
|
||||
int op_sum_rows(struct htp_ops_context * octx);
|
||||
|
||||
@@ -28,18 +28,19 @@ enum htp_data_type {
|
||||
HTP_TYPE_MXFP4 = 39,
|
||||
|
||||
// types used internally for repack, dyn.quant, etc
|
||||
HTP_TYPE_Q4_0x4x2 = 200,
|
||||
HTP_TYPE_Q4_1x4x2,
|
||||
HTP_TYPE_Q8_0x4x2,
|
||||
HTP_TYPE_MXFP4x4x2,
|
||||
HTP_TYPE_Q4_0_TILED = 200,
|
||||
HTP_TYPE_Q4_1_TILED,
|
||||
HTP_TYPE_Q8_0_TILED,
|
||||
HTP_TYPE_MXFP4_TILED,
|
||||
|
||||
HTP_TYPE_INVALID
|
||||
};
|
||||
|
||||
// Constats for internal types
|
||||
#define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
|
||||
#define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
|
||||
#define QK_MXFP4x4x2 256 // 4x MXFP4 blocks concat with next 4x MXFP4 blocks
|
||||
#define QK_Q4_0_TILED 256 // 32x32 Q4_0 tiled layout
|
||||
#define QK_Q8_0_TILED 128 // 32x32 Q8_0 tiled layout
|
||||
#define QK_MXFP4_TILED 256 // 32x32 MXFP4 tiled layout
|
||||
|
||||
|
||||
|
||||
// Mask to enable various stages of the Ops.
|
||||
@@ -57,6 +58,8 @@ enum htp_op_code {
|
||||
HTP_OP_DIV = 3,
|
||||
HTP_OP_MUL_MAT,
|
||||
HTP_OP_MUL_MAT_ID,
|
||||
HTP_OP_MUL_MAT_QKV,
|
||||
HTP_OP_MUL_MAT_FFN,
|
||||
HTP_OP_RMS_NORM,
|
||||
HTP_OP_RMS_NORM_MUL,
|
||||
HTP_OP_UNARY_SILU,
|
||||
@@ -99,7 +102,9 @@ enum htp_op_code {
|
||||
|
||||
#define HTP_OP_MAX_DIMS 4 // aka GGML_MAX_DIMS
|
||||
#define HTP_OP_MAX_INPUTS 6 // aka GGML_MAX_SRCS
|
||||
#define HTP_OP_MAX_OUTPUTS 4
|
||||
#define HTP_OP_MAX_PARAMS 16 // aka GGML_MAX_OP_PARAMS
|
||||
#define HTP_OP_MAX_KERN_PARAMS 32
|
||||
|
||||
#define HTP_OP_MAX_BUFS 16
|
||||
#define HTP_OP_MAX_REQS 256
|
||||
@@ -142,8 +147,10 @@ struct htp_op_desc {
|
||||
uint32_t opcode; // GGML/HTP Op
|
||||
uint32_t flags; // Op flags
|
||||
int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
|
||||
int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS]; // generic blob for host-precomputed parameters
|
||||
uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices
|
||||
uint16_t dst; // Output tensor index
|
||||
uint16_t dst[HTP_OP_MAX_OUTPUTS]; // Output tensor indices
|
||||
uint16_t pad[2]; // padding to align to 64 bits
|
||||
};
|
||||
|
||||
#ifndef HTP_MAX_NTHREADS
|
||||
|
||||
@@ -11,12 +11,13 @@ struct htp_iface_pmu_conf {
|
||||
};
|
||||
|
||||
interface htp_iface : remote_handle64 {
|
||||
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx, in uint64 max_vmem);
|
||||
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 n_hmx, in uint64 max_vmem);
|
||||
AEEResult stop();
|
||||
AEEResult mmap(in uint32 fd, in uint32 size);
|
||||
AEEResult munmap(in uint32 fd);
|
||||
AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
|
||||
AEEResult etm(in uint32 enable);
|
||||
AEEResult hwinfo(rout uint32 n_threads, rout uint32 n_hvx, rout uint32 n_hmx, rout uint64 vtcm_size);
|
||||
};
|
||||
|
||||
#endif /* HTP_IDL */
|
||||
|
||||
@@ -170,25 +170,7 @@ static inline HVX_VectorPair hvx_vec_f16_to_f32(HVX_Vector v) {
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Q6_Vsf_equals_Vw is only available on v73+.*/
|
||||
#if __HVX_ARCH__ < 73
|
||||
static inline HVX_Vector hvx_vec_i32_to_qf32(HVX_Vector const in)
|
||||
{
|
||||
HVX_Vector const vzero = Q6_V_vzero();
|
||||
HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
|
||||
HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
|
||||
HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
|
||||
HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
|
||||
HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
|
||||
HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
|
||||
{
|
||||
return Q6_Vsf_equals_Vqf32(hvx_vec_i32_to_qf32(in));
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) {
|
||||
// This looks complicated.
|
||||
@@ -305,4 +287,17 @@ static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
|
||||
|
||||
#endif // __HVX_ARCH__ < 79
|
||||
|
||||
static inline HVX_Vector hvx_vec_load_act_tile(const uint8_t * y_q, uint32_t kt, HVX_Vector * v_act_all) {
|
||||
if (kt % 4 == 0) {
|
||||
*v_act_all = hvx_vmem(y_q + kt * 32);
|
||||
return *v_act_all;
|
||||
} else if (kt % 4 == 1) {
|
||||
return Q6_V_vror_VR(*v_act_all, 32);
|
||||
} else if (kt % 4 == 2) {
|
||||
return Q6_V_vror_VR(*v_act_all, 64);
|
||||
} else {
|
||||
return Q6_V_vror_VR(*v_act_all, 96);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* HVX_BASE_H */
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -361,7 +361,7 @@ static void vtcm_free(struct htp_context * ctx) {
|
||||
static void htp_packet_callback(dspqueue_t queue, int error, void * context);
|
||||
static void htp_error_callback(dspqueue_t queue, int error, void * context);
|
||||
|
||||
AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx, uint64_t max_vmem) {
|
||||
AEEResult htp_iface_start(remote_handle64 handle, uint32_t sess_id, uint64_t dsp_queue_id, uint32_t n_hvx, uint32_t n_hmx, uint64_t max_vmem) {
|
||||
struct htp_context * ctx = (struct htp_context *) handle;
|
||||
|
||||
if (!ctx) {
|
||||
@@ -395,10 +395,9 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
||||
return AEE_ENOMEMORY;
|
||||
}
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
ctx->hmx_enabled = use_hmx;
|
||||
ctx->hmx_enabled = n_hmx;
|
||||
ctx->hmx_queue = NULL;
|
||||
if (use_hmx) {
|
||||
if (n_hmx) {
|
||||
ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
|
||||
if (ctx->hmx_queue) {
|
||||
ctx->hmx_queue->trace = &ctx->trace[HTP_MAX_NTHREADS];
|
||||
@@ -407,8 +406,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
||||
ctx->hmx_enabled = false;
|
||||
}
|
||||
}
|
||||
FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
|
||||
#endif
|
||||
FARF(HIGH, "HMX %s (n_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", n_hmx);
|
||||
|
||||
qurt_sysenv_max_hthreads_t hw_threads;
|
||||
qurt_sysenv_get_max_hw_threads(&hw_threads);
|
||||
@@ -481,13 +479,11 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
|
||||
dma_queue_delete(ctx->dma[i]);
|
||||
}
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
if (ctx->hmx_queue) {
|
||||
hmx_queue_delete(ctx->hmx_queue);
|
||||
ctx->hmx_queue = NULL;
|
||||
}
|
||||
ctx->hmx_enabled = false;
|
||||
#endif
|
||||
|
||||
vtcm_free(ctx);
|
||||
|
||||
@@ -500,6 +496,36 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
|
||||
return AEE_SUCCESS;
|
||||
}
|
||||
|
||||
AEEResult htp_iface_hwinfo(remote_handle64 handle, uint32_t * n_threads, uint32_t * n_hvx, uint32_t * n_hmx, uint64_t * vtcm_size) {
|
||||
(void)handle;
|
||||
if (!n_threads || !n_hvx || !n_hmx || !vtcm_size) {
|
||||
return AEE_EBADPARM;
|
||||
}
|
||||
|
||||
qurt_sysenv_max_hthreads_t hw_threads;
|
||||
qurt_sysenv_get_max_hw_threads(&hw_threads);
|
||||
uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
|
||||
|
||||
uint32_t n_hvx_val = hw_nhvx;
|
||||
if (n_hvx_val > hw_threads.max_hthreads) {
|
||||
n_hvx_val = hw_threads.max_hthreads;
|
||||
}
|
||||
if (n_hvx_val > HTP_MAX_NTHREADS) {
|
||||
n_hvx_val = HTP_MAX_NTHREADS;
|
||||
}
|
||||
|
||||
// for now we force n_threads == n_hvx
|
||||
*n_threads = n_hvx_val;
|
||||
*n_hvx = n_hvx_val;
|
||||
*n_hmx = 1;
|
||||
|
||||
uint32_t vtcm_sz = 8 * 1024 * 1024; // 8MB default fallback
|
||||
HAP_compute_res_query_VTCM(0, (unsigned int *)&vtcm_sz, NULL, NULL, NULL);
|
||||
*vtcm_size = vtcm_sz;
|
||||
|
||||
return AEE_SUCCESS;
|
||||
}
|
||||
|
||||
static void htp_error_callback(dspqueue_t queue, int error, void * context) {
|
||||
// No errors expected on the DSP.
|
||||
FARF(ERROR, "Error callback: 0x%08x", (unsigned) error);
|
||||
@@ -554,6 +580,12 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_MUL_MAT_ID:
|
||||
return op_matmul_id(octx);
|
||||
|
||||
case HTP_OP_MUL_MAT_QKV:
|
||||
return op_matmul_qkv(octx);
|
||||
|
||||
case HTP_OP_MUL_MAT_FFN:
|
||||
return op_matmul_ffn(octx);
|
||||
|
||||
case HTP_OP_MUL:
|
||||
case HTP_OP_ADD:
|
||||
case HTP_OP_SUB:
|
||||
@@ -762,8 +794,9 @@ static void prep_tensors(struct htp_context *ctx, struct htp_buf_desc *bufs, str
|
||||
}
|
||||
}
|
||||
|
||||
static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
|
||||
static int proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
|
||||
memcpy(octx->op_params, op->params, sizeof(octx->op_params));
|
||||
memcpy(octx->kernel_params, op->kernel_params, sizeof(octx->kernel_params));
|
||||
octx->flags = op->flags;
|
||||
octx->op = op->opcode;
|
||||
|
||||
@@ -785,22 +818,41 @@ static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens,
|
||||
src->ne[0], src->ne[1], src->ne[3], src->ne[3]);
|
||||
}
|
||||
|
||||
// Prep output tensor
|
||||
struct htp_tensor *dst = tens + op->dst;
|
||||
// Prep output tensors
|
||||
for (uint32_t i = 0; i < HTP_OP_MAX_OUTPUTS; i++) {
|
||||
uint16_t dst_idx = op->dst[i];
|
||||
if (dst_idx == 0xffff) {
|
||||
octx->dsts[i] = NULL;
|
||||
continue;
|
||||
}
|
||||
struct htp_tensor *dst = tens + dst_idx;
|
||||
octx->dsts[i] = dst;
|
||||
|
||||
octx->dst = dst;
|
||||
FARF(HIGH, "prep-dst[%u] #%u: data %p size %u : %u:%u:%u:%u", i, dst_idx, (void*) dst->data, dst->size,
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
|
||||
}
|
||||
|
||||
FARF(HIGH, "prep-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
|
||||
dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
|
||||
int status = execute_op(octx);
|
||||
|
||||
(void) execute_op(octx);
|
||||
octx->src0_spad.src = NULL;
|
||||
octx->src1_spad.src = NULL;
|
||||
octx->src2_spad.src = NULL;
|
||||
octx->src3_spad.src = NULL;
|
||||
octx->dst_spad.src = NULL;
|
||||
|
||||
// flush buffers on output
|
||||
hex_l2flush((void *) dst->data, dst->size);
|
||||
dst->flags |= HTP_TENSOR_FLUSHED;
|
||||
for (uint32_t i = 0; i < HTP_OP_MAX_OUTPUTS; i++) {
|
||||
if (octx->dsts[i]) {
|
||||
struct htp_tensor *dst = (struct htp_tensor *)octx->dsts[i];
|
||||
hex_l2flush((void *) dst->data, dst->size);
|
||||
dst->flags |= HTP_TENSOR_FLUSHED;
|
||||
|
||||
FARF(HIGH, "post-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
|
||||
dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
|
||||
FARF(HIGH, "post-dst[%u] #%u: data %p size %u : %u:%u:%u:%u", i, op->dst[i], (void*) dst->data, dst->size,
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
#define DSPQUEUE_POLL_TIMEOUT_USEC 100
|
||||
@@ -892,20 +944,26 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
}
|
||||
}
|
||||
|
||||
int op_status = HTP_STATUS_OK;
|
||||
uint32_t op_wakeup = n_ops / 2; // half-way throgh the batch
|
||||
|
||||
for (uint32_t i=0; i < n_ops; i++) {
|
||||
struct profile_data prof;
|
||||
|
||||
if (i == (n_ops-1)) {
|
||||
// wake up the host before starting the last op
|
||||
if (i == op_wakeup) {
|
||||
dspqueue_write_early_wakeup_noblock(queue, 0, 0);
|
||||
}
|
||||
|
||||
profile_start(ctx->profiler, &prof);
|
||||
|
||||
proc_op_req(octx, tens, i, &ops[i]);
|
||||
op_status = proc_op_req(octx, tens, i, &ops[i]);
|
||||
|
||||
profile_stop(ctx->profiler, &prof);
|
||||
|
||||
if (op_status != HTP_STATUS_OK) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ctx->profiler) {
|
||||
pds[i].opcode = ops[i].opcode;
|
||||
pds[i].usecs = prof.usecs;
|
||||
@@ -919,7 +977,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
|
||||
struct htp_opbatch_rsp rsp;
|
||||
rsp.id = req.id;
|
||||
rsp.status = HTP_STATUS_OK;
|
||||
rsp.status = op_status;
|
||||
rsp.n_bufs = n_bufs;
|
||||
rsp.n_tensors = n_tens;
|
||||
rsp.n_ops = n_ops;
|
||||
|
||||
+2729
-4117
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,508 @@
|
||||
#ifndef HTP_MATMUL_OPS_H
|
||||
#define HTP_MATMUL_OPS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include "htp-ops.h"
|
||||
#include "hex-fastdiv.h"
|
||||
#include "hex-common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// --- HMX Tile Constraints ---
|
||||
#define HTP_MM_HMX_TILE_N_COLS 32
|
||||
#define HTP_MM_HMX_TILE_N_ROWS 32
|
||||
#define HTP_MM_HMX_TILE_SIZE (32 * 32 * sizeof(__fp16)) // 2048 bytes
|
||||
#define HTP_MM_HMX_TILE_N_ELMS 1024
|
||||
#define HTP_MM_HMX_MIN_NROWS 4
|
||||
|
||||
// --- Weight Repacked Tile Sizes ---
|
||||
#define HTP_MM_WEIGHT_TILE_SIZE_Q4_0 576
|
||||
#define HTP_MM_WEIGHT_TILE_SIZE_Q4_1 640
|
||||
#define HTP_MM_WEIGHT_TILE_SIZE_Q8_0 1088
|
||||
#define HTP_MM_WEIGHT_TILE_SIZE_IQ4_NL 576
|
||||
#define HTP_MM_WEIGHT_TILE_SIZE_MXFP4 544
|
||||
|
||||
// --- Weight Repacked Aligned Tile Sizes ---
|
||||
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_0 640
|
||||
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_1 640
|
||||
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q8_0 1152
|
||||
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_IQ4_NL 640
|
||||
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_MXFP4 640
|
||||
|
||||
// --- Activation Tiled Block Sizes (including padding) ---
|
||||
#define HTP_MM_ACT_TILE_SIZE_Q8_0 1152
|
||||
#define HTP_MM_ACT_TILE_SIZE_Q8_1 1280
|
||||
|
||||
#define HTP_MM_MAX_PREFETCH 16
|
||||
|
||||
// --- Solver Cost Model Penalty Weights (HMX-specific) ---
|
||||
#define HTP_MM_HMX_COST_W_DEQUANT 3 // cost penalty for quantized weight loading/dequantization
|
||||
#define HTP_MM_HMX_COST_A_CONVERT 2 // cost penalty for activation loading/conversion
|
||||
|
||||
// --- DMA Activation Transfer Configuration ---
|
||||
#define HTP_MM_DMA_ACT_ROWS_PER_STEP 2
|
||||
#define HTP_MM_DMA_ACT_MULTIPLIER 4
|
||||
|
||||
enum htp_mm_kernel_type {
|
||||
HTP_MM_KERNEL_UNSUPPORTED = 0,
|
||||
|
||||
// HMX paths
|
||||
HTP_MM_KERNEL_HMX_2D,
|
||||
HTP_MM_KERNEL_HMX_F16_BATCHED,
|
||||
|
||||
// HVX floating-point paths
|
||||
HTP_MM_KERNEL_HVX_F16_F16_VTCM,
|
||||
HTP_MM_KERNEL_HVX_F16_F16_DDR,
|
||||
HTP_MM_KERNEL_HVX_F16_F32_DDR,
|
||||
|
||||
HTP_MM_KERNEL_HVX_F32_F32_VTCM,
|
||||
HTP_MM_KERNEL_HVX_F32_F32_DDR,
|
||||
HTP_MM_KERNEL_HVX_F32_F16_DDR,
|
||||
|
||||
// HVX quantized paths
|
||||
HTP_MM_KERNEL_HVX_QUANT_ROW, // standard row-wise parallel quantization
|
||||
HTP_MM_KERNEL_HVX_QUANT_BLOCK, // parallel block-wise quantization
|
||||
HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT, // row-wise fallback flat quantization
|
||||
};
|
||||
|
||||
// Op-specific struct for precomputed matmul params
|
||||
struct htp_mm_kernel_params {
|
||||
int32_t kernel_type; // enum htp_mm_kernel_type
|
||||
int32_t pipeline; // 1 = pipelined execution, 0 = standard
|
||||
int32_t m_chunk; // Row chunk size (M chunk)
|
||||
int32_t n_chunk; // Col chunk size (N chunk)
|
||||
int32_t n_threads; // Number of threads to spawn
|
||||
int32_t n_act_threads; // Number of threads for activation preparation
|
||||
int32_t n_hmx; // 1 = use HMX, 0 = use HVX
|
||||
int32_t n_prefetch; // Prefetch lookahead buffers/rows in VTCM
|
||||
int32_t tile_size; // Weight tile size
|
||||
int32_t aligned_tile_size; // Aligned weight tile size (padded to 128)
|
||||
int32_t src1_row_size; // Row size for quantized activation
|
||||
int32_t vtcm_size; // Total required scratchpad size in VTCM
|
||||
int32_t vtcm_src0_size; // src0 scratchpad size in VTCM
|
||||
int32_t vtcm_src1_size; // src1 scratchpad size in VTCM
|
||||
int32_t vtcm_src2_size; // src2 scratchpad size in VTCM (fused only)
|
||||
int32_t vtcm_src3_size; // src3 scratchpad size in VTCM (fused only)
|
||||
int32_t vtcm_dst_size; // dst scratchpad size in VTCM
|
||||
|
||||
// Precomputed division values
|
||||
struct fastdiv_values div_ne12_ne1;
|
||||
struct fastdiv_values div_ne1;
|
||||
struct fastdiv_values div_r2;
|
||||
struct fastdiv_values div_r3;
|
||||
struct fastdiv_values div_ne11;
|
||||
};
|
||||
|
||||
#if defined(__cplusplus)
|
||||
static_assert(sizeof(struct htp_mm_kernel_params) <= 128, "htp_matmul_kernel_params is too large for kernel_params blob");
|
||||
#else
|
||||
_Static_assert(sizeof(struct htp_mm_kernel_params) <= 128, "htp_matmul_kernel_params is too large for kernel_params blob");
|
||||
#endif
|
||||
|
||||
struct mmid_row_mapping {
|
||||
uint32_t i1;
|
||||
uint32_t i2;
|
||||
};
|
||||
|
||||
// Search for optimal (mc, nc) chunk sizes within VTCM budget.
|
||||
static inline int htp_mm_hmx_compute_chunks(size_t vtcm_total,
|
||||
size_t overhead,
|
||||
size_t per_n_cost,
|
||||
size_t per_m_cost,
|
||||
size_t per_mn_cost,
|
||||
size_t m,
|
||||
size_t n,
|
||||
size_t m_block_cost,
|
||||
size_t n_block_cost,
|
||||
size_t * m_chunk_out,
|
||||
size_t * n_chunk_out,
|
||||
size_t * total_out) {
|
||||
if (m == 0 || n == 0) return -1;
|
||||
if (vtcm_total <= overhead) return -1;
|
||||
if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1;
|
||||
|
||||
const size_t usable = vtcm_total - overhead;
|
||||
|
||||
size_t best_cost = SIZE_MAX;
|
||||
size_t best_mn = 0;
|
||||
size_t best_m = 0, best_n = 0;
|
||||
|
||||
const size_t n_max = hex_align_down((size_t)n, HTP_MM_HMX_TILE_N_COLS);
|
||||
for (size_t nc = n_max; nc >= HTP_MM_HMX_TILE_N_COLS; nc -= HTP_MM_HMX_TILE_N_COLS) {
|
||||
size_t n_fixed = 0, ncmn = 0, mc_denom = 0;
|
||||
if (hex_mul_overflow(nc, per_n_cost, &n_fixed)) continue;
|
||||
if (n_fixed >= usable) goto next_nc;
|
||||
|
||||
if (hex_mul_overflow(nc, per_mn_cost, &ncmn)) goto next_nc;
|
||||
if (hex_add_overflow(per_m_cost, ncmn, &mc_denom) || mc_denom == 0) goto next_nc;
|
||||
|
||||
{
|
||||
size_t remain = usable - n_fixed;
|
||||
size_t mc = remain / mc_denom;
|
||||
mc = hex_align_down(mc, HTP_MM_HMX_TILE_N_ROWS);
|
||||
mc = hex_smin(mc, m);
|
||||
|
||||
if (mc == 0) {
|
||||
goto next_nc;
|
||||
}
|
||||
|
||||
size_t mblocks = ((size_t) m + mc - 1) / mc;
|
||||
size_t nblocks = ((size_t) n + nc - 1) / nc;
|
||||
size_t cost = mblocks * m_block_cost + nblocks * n_block_cost;
|
||||
size_t mn = mc * nc;
|
||||
if (cost < best_cost || (cost == best_cost && mn > best_mn)) {
|
||||
best_cost = cost;
|
||||
best_mn = mn;
|
||||
best_m = mc;
|
||||
best_n = nc;
|
||||
}
|
||||
}
|
||||
|
||||
next_nc:
|
||||
if (nc == HTP_MM_HMX_TILE_N_COLS) break; // avoid size_t underflow
|
||||
}
|
||||
|
||||
if (best_m == 0 || best_n == 0) return -1;
|
||||
|
||||
// Compute exact total (with overflow checks)
|
||||
size_t t0 = 0, t1 = 0, t2 = 0, mn = 0, total = 0;
|
||||
if (hex_mul_overflow(best_n, per_n_cost, &t0)) return -1;
|
||||
if (hex_mul_overflow(best_m, per_m_cost, &t1)) return -1;
|
||||
if (hex_mul_overflow(best_m, best_n, &mn)) return -1;
|
||||
if (hex_mul_overflow(mn, per_mn_cost, &t2)) return -1;
|
||||
if (hex_add_overflow(t0, t1, &total)) return -1;
|
||||
if (hex_add_overflow(total, t2, &total)) return -1;
|
||||
if (hex_add_overflow(total, overhead, &total)) return -1;
|
||||
|
||||
*m_chunk_out = best_m;
|
||||
*n_chunk_out = best_n;
|
||||
*total_out = total;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// --- Tile Size Helpers ---
|
||||
static inline uint32_t htp_mm_get_weight_tile_size(int weight_type) {
|
||||
switch (weight_type) {
|
||||
case HTP_TYPE_Q4_0:
|
||||
case HTP_TYPE_IQ4_NL:
|
||||
return HTP_MM_WEIGHT_TILE_SIZE_Q4_0;
|
||||
case HTP_TYPE_Q4_1:
|
||||
return HTP_MM_WEIGHT_TILE_SIZE_Q4_1;
|
||||
case HTP_TYPE_Q8_0:
|
||||
return HTP_MM_WEIGHT_TILE_SIZE_Q8_0;
|
||||
case HTP_TYPE_MXFP4:
|
||||
return HTP_MM_WEIGHT_TILE_SIZE_MXFP4;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static inline uint32_t htp_mm_get_weight_aligned_tile_size(int weight_type) {
|
||||
switch (weight_type) {
|
||||
case HTP_TYPE_Q4_0:
|
||||
case HTP_TYPE_IQ4_NL:
|
||||
return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_0;
|
||||
case HTP_TYPE_Q4_1:
|
||||
return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_1;
|
||||
case HTP_TYPE_Q8_0:
|
||||
return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q8_0;
|
||||
case HTP_TYPE_MXFP4:
|
||||
return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_MXFP4;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// --- Activation/Row Size Helpers ---
|
||||
static inline size_t htp_mm_q8_0_tiled_row_size(uint32_t ne) {
|
||||
const uint32_t ne_padded = ((ne + 127) / 128) * 128;
|
||||
const uint32_t nb_32 = ne_padded / 32;
|
||||
return nb_32 * HTP_MM_ACT_TILE_SIZE_Q8_0;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_q8_1_tiled_row_size(uint32_t ne) {
|
||||
const uint32_t ne_padded = ((ne + 127) / 128) * 128;
|
||||
const uint32_t nb_32 = ne_padded / 32;
|
||||
return nb_32 * HTP_MM_ACT_TILE_SIZE_Q8_1;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_q8_0_flat_row_size(uint32_t ne) {
|
||||
const uint32_t quants_size = hex_align_up(ne, 128);
|
||||
const uint32_t num_scales = (ne + 31) / 32;
|
||||
const uint32_t scales_size = hex_align_up(num_scales * 2, 128);
|
||||
return quants_size + scales_size;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_q8_1_flat_row_size(uint32_t ne) {
|
||||
const uint32_t quants_size = hex_align_up(ne, 128);
|
||||
const uint32_t num_scales = (ne + 31) / 32;
|
||||
const uint32_t scales_size = hex_align_up(num_scales * 4, 128);
|
||||
return quants_size + scales_size;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_get_tiled_row_stride(int weight_type, uint32_t k) {
|
||||
uint32_t nb = (k + QK_Q4_0_TILED - 1) / QK_Q4_0_TILED;
|
||||
switch (weight_type) {
|
||||
case HTP_TYPE_Q4_0:
|
||||
case HTP_TYPE_IQ4_NL:
|
||||
case HTP_TYPE_Q4_1:
|
||||
case HTP_TYPE_Q8_0:
|
||||
case HTP_TYPE_MXFP4:
|
||||
return (size_t) nb * htp_mm_get_weight_tile_size(weight_type);
|
||||
case HTP_TYPE_F16:
|
||||
return (size_t) k * sizeof(__fp16);
|
||||
case HTP_TYPE_F32:
|
||||
return (size_t) k * sizeof(float);
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_round_up(size_t n, size_t m) {
|
||||
return ((n + m - 1) / m) * m;
|
||||
}
|
||||
|
||||
static inline bool htp_mm_hmx_pipeline(uint32_t m) {
|
||||
return m > 32;
|
||||
}
|
||||
|
||||
static inline void htp_mm_hmx_get_2d_chunk_costs(
|
||||
int wtype, uint32_t k, bool pipeline, uint32_t aligned_tile_size,
|
||||
size_t * size_per_n_out, size_t * size_per_m_out, size_t * size_per_mn_out
|
||||
) {
|
||||
const bool is_quant = (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32);
|
||||
const size_t row_stride = htp_mm_get_tiled_row_stride(wtype, k);
|
||||
const size_t vec_dot_size = k * sizeof(uint16_t);
|
||||
const uint32_t n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS;
|
||||
const size_t qweight_row_stride = is_quant ? (size_t)(n_k_tiles * aligned_tile_size) / 32 : 0;
|
||||
|
||||
*size_per_n_out = (pipeline ? 2 : 1) * (is_quant ? qweight_row_stride : row_stride) +
|
||||
(pipeline ? 2 * vec_dot_size : vec_dot_size);
|
||||
*size_per_m_out = vec_dot_size;
|
||||
*size_per_mn_out = (pipeline ? 2 : 1) * sizeof(uint16_t);
|
||||
}
|
||||
|
||||
static inline void htp_mm_hmx_get_batched_chunk_costs(
|
||||
uint32_t k, uint32_t group_size,
|
||||
size_t * size_per_n_out, size_t * size_per_m_out, size_t * size_per_mn_out
|
||||
) {
|
||||
const size_t vec_dot_size = k * sizeof(uint16_t);
|
||||
*size_per_n_out = 3 * vec_dot_size;
|
||||
*size_per_m_out = group_size * vec_dot_size;
|
||||
*size_per_mn_out = sizeof(uint16_t);
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_hmx_get_2d_vtcm_size(
|
||||
int wtype, uint32_t k, size_t mc, size_t nc, bool pipeline, uint32_t act_threads, uint32_t aligned_tile_size
|
||||
) {
|
||||
const uint32_t n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS;
|
||||
const bool is_quant = (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32);
|
||||
const size_t row_stride = htp_mm_get_tiled_row_stride(wtype, k);
|
||||
const size_t vec_dot_size = k * sizeof(uint16_t);
|
||||
|
||||
const size_t act_f32_size = htp_mm_round_up(act_threads * 4 * k * sizeof(float), HTP_MM_HMX_TILE_SIZE);
|
||||
size_t weight_area_size = is_quant
|
||||
? htp_mm_round_up((nc / 32) * n_k_tiles * aligned_tile_size, HTP_MM_HMX_TILE_SIZE)
|
||||
: htp_mm_round_up(nc * row_stride, HTP_MM_HMX_TILE_SIZE);
|
||||
if (pipeline) {
|
||||
weight_area_size *= 2;
|
||||
}
|
||||
const size_t act_area_size = htp_mm_round_up(mc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
|
||||
const size_t output_area_size = htp_mm_round_up(mc * nc * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
|
||||
|
||||
size_t scratch0_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
|
||||
size_t scratch1_size = pipeline ? scratch0_size : 0;
|
||||
size_t scratch2_size = pipeline ? output_area_size : 0;
|
||||
|
||||
return weight_area_size + act_area_size + act_f32_size + output_area_size +
|
||||
scratch0_size + scratch1_size + scratch2_size + 256;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_hmx_get_batched_vtcm_size(
|
||||
int wtype, uint32_t k, size_t mc, size_t nc, uint32_t group_size, bool use_dma_activation, bool pipeline, uint32_t act_threads) {
|
||||
(void)wtype;
|
||||
(void)pipeline;
|
||||
const size_t vec_dot_size = k * sizeof(uint16_t);
|
||||
const size_t f32_scratch_size = use_dma_activation
|
||||
? htp_mm_round_up(act_threads * 4 * k * sizeof(float), HTP_MM_HMX_TILE_SIZE) : 0;
|
||||
|
||||
const size_t act_head_stride = mc * k;
|
||||
const size_t weight_area_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
|
||||
const size_t act_area_size = htp_mm_round_up(group_size * act_head_stride * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
|
||||
const size_t output_area_size = htp_mm_round_up(group_size * mc * nc * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
|
||||
const size_t scratch_area_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
|
||||
|
||||
return weight_area_size + act_area_size + output_area_size +
|
||||
2 * scratch_area_size + 256 + f32_scratch_size;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_hvx_get_vtcm_sizes(
|
||||
int kernel_type,
|
||||
int wtype,
|
||||
uint32_t ne10, // k
|
||||
uint32_t src1_nrows, // m_total (or act_nrows)
|
||||
uint32_t n_threads,
|
||||
size_t dst_row_size,
|
||||
size_t src0_row_size,
|
||||
size_t src1_row_size,
|
||||
uint32_t n_prefetch,
|
||||
size_t * vtcm_src0_size_out,
|
||||
size_t * vtcm_src1_size_out,
|
||||
size_t * vtcm_dst_size_out
|
||||
) {
|
||||
size_t vtcm_src0_size = 0;
|
||||
size_t vtcm_src1_size = 0;
|
||||
size_t vtcm_dst_size = 0;
|
||||
|
||||
const bool is_repack = (wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 ||
|
||||
wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL ||
|
||||
wtype == HTP_TYPE_MXFP4);
|
||||
|
||||
const size_t src0_row_size_padded = htp_mm_round_up(src0_row_size, 128);
|
||||
const size_t dst_nrows = (src1_nrows > 1) ? 0 : 1;
|
||||
|
||||
switch (kernel_type) {
|
||||
case HTP_MM_KERNEL_HVX_F16_F16_VTCM: {
|
||||
size_t f16_src1_row_size = htp_mm_round_up(ne10 * 2, 128);
|
||||
vtcm_src1_size = htp_mm_round_up(f16_src1_row_size * src1_nrows, 256);
|
||||
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256) * n_threads;
|
||||
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
|
||||
break;
|
||||
}
|
||||
case HTP_MM_KERNEL_HVX_F16_F32_DDR:
|
||||
case HTP_MM_KERNEL_HVX_F16_F16_DDR:
|
||||
case HTP_MM_KERNEL_HVX_F32_F32_DDR:
|
||||
case HTP_MM_KERNEL_HVX_F32_F16_DDR: {
|
||||
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size, 256) * n_threads;
|
||||
vtcm_src1_size = htp_mm_round_up(n_prefetch * src1_row_size, 256) * n_threads;
|
||||
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
|
||||
break;
|
||||
}
|
||||
case HTP_MM_KERNEL_HVX_F32_F32_VTCM: {
|
||||
size_t f32_src1_row_size = htp_mm_round_up(ne10 * 4, 128);
|
||||
vtcm_src1_size = htp_mm_round_up(f32_src1_row_size * src1_nrows, 256);
|
||||
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256) * n_threads;
|
||||
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
|
||||
break;
|
||||
}
|
||||
case HTP_MM_KERNEL_HVX_QUANT_BLOCK:
|
||||
case HTP_MM_KERNEL_HVX_QUANT_ROW: {
|
||||
size_t q_src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10);
|
||||
|
||||
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) : 0;
|
||||
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
|
||||
vtcm_src1_size = htp_mm_round_up(q_src1_row_size * src1_nrows, 256);
|
||||
|
||||
// src0 spad is also used in dynamic quantizer to store padded src1 rows
|
||||
size_t src1_row_size_padded = htp_mm_round_up(q_src1_row_size, QK_Q8_0_TILED * sizeof(float));
|
||||
if (vtcm_src0_size < src1_row_size_padded) {
|
||||
vtcm_src0_size = src1_row_size_padded;
|
||||
}
|
||||
|
||||
vtcm_src0_size = vtcm_src0_size * n_threads;
|
||||
vtcm_dst_size = vtcm_dst_size * n_threads;
|
||||
|
||||
if (is_repack) {
|
||||
uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
|
||||
uint32_t n_k_tiles = ne10 / 32;
|
||||
uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
|
||||
size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
|
||||
if (repacked_vtcm_size < src1_row_size_padded) {
|
||||
repacked_vtcm_size = src1_row_size_padded;
|
||||
}
|
||||
vtcm_src0_size = repacked_vtcm_size * n_threads;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT: {
|
||||
size_t q_src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_flat_row_size(ne10) : htp_mm_q8_0_flat_row_size(ne10);
|
||||
|
||||
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) : 0;
|
||||
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
|
||||
vtcm_src1_size = htp_mm_round_up(q_src1_row_size * src1_nrows, 256);
|
||||
|
||||
size_t src1_row_size_padded = htp_mm_round_up(q_src1_row_size, 256);
|
||||
if (vtcm_src0_size < src1_row_size_padded) {
|
||||
vtcm_src0_size = src1_row_size_padded;
|
||||
}
|
||||
|
||||
vtcm_src0_size = vtcm_src0_size * n_threads;
|
||||
vtcm_dst_size = vtcm_dst_size * n_threads;
|
||||
|
||||
if (is_repack) {
|
||||
uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
|
||||
uint32_t n_k_tiles = ne10 / 32;
|
||||
uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
|
||||
size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
|
||||
if (repacked_vtcm_size < src1_row_size_padded) {
|
||||
repacked_vtcm_size = src1_row_size_padded;
|
||||
}
|
||||
vtcm_src0_size = repacked_vtcm_size * n_threads;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
*vtcm_src0_size_out = vtcm_src0_size;
|
||||
*vtcm_src1_size_out = vtcm_src1_size;
|
||||
*vtcm_dst_size_out = vtcm_dst_size;
|
||||
|
||||
return vtcm_src0_size + vtcm_src1_size + vtcm_dst_size;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_hvx_id_get_vtcm_sizes(
|
||||
int wtype,
|
||||
uint32_t ne10, // k
|
||||
uint32_t src1_nrows,
|
||||
uint32_t n_threads,
|
||||
size_t src0_row_size, // nb01
|
||||
uint32_t n_prefetch,
|
||||
size_t * vtcm_src0_size_out,
|
||||
size_t * vtcm_src1_size_out
|
||||
) {
|
||||
const bool is_repack = (wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 ||
|
||||
wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL ||
|
||||
wtype == HTP_TYPE_MXFP4);
|
||||
|
||||
const size_t src0_row_size_padded = htp_mm_round_up(src0_row_size, 128);
|
||||
const size_t src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10)
|
||||
: htp_mm_q8_0_tiled_row_size(ne10);
|
||||
|
||||
size_t src0_sz_per_thread = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
|
||||
size_t src1_sz = htp_mm_round_up(src1_row_size * src1_nrows, 256);
|
||||
|
||||
// src0 spad also holds temporary transposed src1 columns during dynamic quantization.
|
||||
const size_t src1_row_size_padded = htp_mm_round_up(src1_row_size, QK_Q8_0_TILED * sizeof(float));
|
||||
if (src0_sz_per_thread < src1_row_size_padded) {
|
||||
src0_sz_per_thread = src1_row_size_padded;
|
||||
}
|
||||
|
||||
if (is_repack) {
|
||||
const uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
|
||||
const uint32_t n_k_tiles = ne10 / 32;
|
||||
const uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
|
||||
size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
|
||||
if (repacked_vtcm_size < src1_row_size_padded) {
|
||||
repacked_vtcm_size = src1_row_size_padded;
|
||||
}
|
||||
src0_sz_per_thread = repacked_vtcm_size;
|
||||
}
|
||||
|
||||
const size_t vtcm_src0_size = src0_sz_per_thread * n_threads;
|
||||
|
||||
*vtcm_src0_size_out = vtcm_src0_size;
|
||||
*vtcm_src1_size_out = src1_sz;
|
||||
|
||||
return vtcm_src0_size + src1_sz;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // HTP_MATMUL_OPS_H
|
||||
@@ -14,8 +14,6 @@ Drivers_Dir = 13
|
||||
1 = %DiskId%
|
||||
|
||||
[SourceDisksFiles]
|
||||
libggml-htp-v68.so = 1
|
||||
libggml-htp-v69.so = 1
|
||||
libggml-htp-v73.so = 1
|
||||
libggml-htp-v75.so = 1
|
||||
libggml-htp-v79.so = 1
|
||||
@@ -28,8 +26,6 @@ ExcludeFromSelect = *
|
||||
CopyFiles=Drivers_Dir
|
||||
|
||||
[Drivers_Dir]
|
||||
libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v79.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
|
||||
@@ -850,6 +850,7 @@ struct ggml_backend_opencl_context {
|
||||
ref_count--;
|
||||
if (ref_count == 0) {
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
flush_profiling_batch();
|
||||
write_profiling_info();
|
||||
profiling_results.clear();
|
||||
#endif
|
||||
@@ -10152,14 +10153,8 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
const int ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int ne01 = src0 ? src0->ne[1] : 0;
|
||||
const int ne02 = src0 ? src0->ne[2] : 0;
|
||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
||||
|
||||
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
||||
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
||||
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
||||
GGML_TENSOR_LOCALS(int, ne0, src0, ne);
|
||||
GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
|
||||
|
||||
const int nth = MIN(64, ne00);
|
||||
|
||||
@@ -10173,11 +10168,12 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &eps));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float)*nth, NULL));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
@@ -24,6 +24,7 @@ kernel void kernel_norm(
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
@@ -43,7 +44,8 @@ kernel void kernel_norm(
|
||||
// parallel sum
|
||||
sum[get_local_id(0)] = 0.0f;
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
sum[get_local_id(0)] += x[i00];
|
||||
// this kernel handles float, nb00/4 translates byte offset to element offset
|
||||
sum[get_local_id(0)] += x[i00*nb00/4];
|
||||
}
|
||||
// reduce
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -60,7 +62,8 @@ kernel void kernel_norm(
|
||||
global float * y = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
sum[get_local_id(0)] = 0.0f;
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
y[i00] = x[i00] - mean;
|
||||
// this kernel handles float, nb00/4 translates byte offset to element offset
|
||||
y[i00] = x[i00*nb00/4] - mean;
|
||||
sum[get_local_id(0)] += y[i00] * y[i00];
|
||||
}
|
||||
|
||||
|
||||
@@ -103,8 +103,8 @@ void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
// allocate packed arrays: A_packed (k x m), B_packed (k x n)
|
||||
ggml_sycl_pool_alloc<float> A_packed_alloc(ctx.pool());
|
||||
ggml_sycl_pool_alloc<float> B_packed_alloc(ctx.pool());
|
||||
A_packed_alloc.alloc((size_t) knl_n_total * patch_total * sizeof(float));
|
||||
B_packed_alloc.alloc((size_t) knl_n_total * oc * sizeof(float));
|
||||
A_packed_alloc.alloc((size_t) knl_n_total * patch_total);
|
||||
B_packed_alloc.alloc((size_t) knl_n_total * oc);
|
||||
|
||||
float * A_packed = A_packed_alloc.get();
|
||||
float * B_packed = B_packed_alloc.get();
|
||||
@@ -115,10 +115,16 @@ void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
// Combined kernel: im2col -> pack A, and pack B simultaneously
|
||||
const char * src1_base = (const char *) src1->data;
|
||||
const char * src0_base = (const char *) src0->data;
|
||||
const int64_t src1_nb0 = src1->nb[0];
|
||||
const int64_t src1_nb1 = src1->nb[1];
|
||||
const int64_t src1_nb2 = src1->nb[2];
|
||||
const int64_t src1_nb3 = src1->nb[3];
|
||||
const int64_t src1_w = src1->ne[0];
|
||||
const int64_t src1_h = src1->ne[1];
|
||||
const int64_t src1_d = src1->ne[2];
|
||||
|
||||
const bool src0_is_f32 = (src0->type == GGML_TYPE_F32);
|
||||
|
||||
// Compute correct strides for src0 as (knl_n_total, oc) matrix
|
||||
const int64_t src0_packed_nb0 = kernel_type_size;
|
||||
@@ -165,7 +171,7 @@ void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
const int64_t sz = dst_z * s2 + kz * d2 - p2;
|
||||
|
||||
float val = 0.0f;
|
||||
if (sx >= 0 && sx < src1->ne[0] && sy >= 0 && sy < src1->ne[1] && sz >= 0 && sz < src1->ne[2]) {
|
||||
if (sx >= 0 && sx < src1_w && sy >= 0 && sy < src1_h && sz >= 0 && sz < src1_d) {
|
||||
const int64_t channel_idx = batch_idx * c + ic;
|
||||
const char * ptr = src1_base + sx * src1_nb0 + sy * src1_nb1 + sz * src1_nb2 + channel_idx * src1_nb3;
|
||||
val = *(const float *) ptr;
|
||||
@@ -184,9 +190,9 @@ void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
const int64_t row = t % k;
|
||||
const int64_t col = t / k;
|
||||
const char * src_ptr = (const char *) src0->data + row * src0_packed_nb0 + col * src0_packed_nb1;
|
||||
const char * src_ptr = src0_base + row * src0_packed_nb0 + col * src0_packed_nb1;
|
||||
float v;
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
if (src0_is_f32) {
|
||||
v = *(const float *) src_ptr;
|
||||
} else {
|
||||
v = sycl::vec<sycl::half, 1>(*(const sycl::half *) src_ptr).convert<float, sycl::rounding_mode::automatic>()[0];
|
||||
|
||||
@@ -5859,6 +5859,250 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
|
||||
return ctx->devices[index];
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
// Tensor parallelism (--split-mode tensor) for the SYCL backend.
|
||||
//
|
||||
// The meta-backend invokes these three entry points via get_proc_address:
|
||||
// * ggml_backend_sycl_comm_init - one-time per-graph setup
|
||||
// * ggml_backend_sycl_comm_allreduce_tensor - per-allreduce step
|
||||
// * ggml_backend_sycl_comm_free - tear-down
|
||||
//
|
||||
// For N=2 (dual-GPU), this is a degenerate ring allreduce with dual paths
|
||||
// chosen by tensor size:
|
||||
//
|
||||
// * Small (nelem < 32K): FP32 direct memcpy + per-device ADD
|
||||
// kernel. The kernel depends_on() its corresponding memcpy event
|
||||
// so it doesn't read partial data. Both devices run in parallel.
|
||||
//
|
||||
// * Large (nelem >= 32K): BF16-compressed. Each device compresses
|
||||
// its FP32 partial to BF16 locally, cross-device memcpys
|
||||
// to the peer (half the PCI bandwidth), where it is decompressed
|
||||
// and added into the local FP32 partial. 6 SYCL submissions per
|
||||
// allreduce (2 compress + 2 memcpy + 2 decompress-add) vs the
|
||||
// 4 for the small path, but the bandwidth saving > 6 GB/s PCIe x 2
|
||||
// dominates for larger tensors.
|
||||
//
|
||||
// Storage: A persistent uint8_t buffer per device, sized to
|
||||
// 4 * nelem bytes. Both paths reinterpret the same bytes (small path
|
||||
// as nelem floats; large path as outbox + inbox = 2*nelem uint16_t
|
||||
// each, using the full 4*nelem byte budget either way). Single
|
||||
// alloc+free per device keeps the SYCL pool's strict-LIFO invariant
|
||||
// trivial.
|
||||
//
|
||||
// For non-(N=2 FP32 contiguous) cases, comm_init or comm_allreduce_tensor
|
||||
// returns null/false, causing the meta-backend to use its generic
|
||||
// butterfly all-reduce fallback.
|
||||
// ==========================================================================
|
||||
|
||||
struct ggml_backend_sycl_comm_context {
|
||||
std::vector<ggml_backend_t> backends;
|
||||
// ONE persistent per-device byte buffer, 4*nelem bytes. Both the
|
||||
// FP32 small-tensor path and the BF16 large-tensor path share it
|
||||
// by reinterpreting.
|
||||
std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>> buf0;
|
||||
std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>> buf1;
|
||||
int64_t buf_nelem = 0;
|
||||
};
|
||||
|
||||
void * ggml_backend_sycl_comm_init(ggml_backend_t * backends, size_t n_backends) try {
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
if (!ggml_backend_is_sycl(backends[i])) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Initial version: N=2 only. For N!=2, returning null makes the
|
||||
// meta-backend skip this backend-specific allreduce entirely.
|
||||
if (n_backends != 2) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto * ctx = new ggml_backend_sycl_comm_context;
|
||||
ctx->backends.assign(backends, backends + n_backends);
|
||||
auto * sctx0 = (ggml_backend_sycl_context *) backends[0]->context;
|
||||
auto * sctx1 = (ggml_backend_sycl_context *) backends[1]->context;
|
||||
ctx->buf0 = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(sctx0->pool());
|
||||
ctx->buf1 = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(sctx1->pool());
|
||||
return ctx;
|
||||
}
|
||||
catch (const sycl::exception &) { return nullptr; }
|
||||
catch (...) { return nullptr; }
|
||||
|
||||
void ggml_backend_sycl_comm_free(void * comm_ctx_v) {
|
||||
auto * comm_ctx = static_cast<ggml_backend_sycl_comm_context *>(comm_ctx_v);
|
||||
if (comm_ctx == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Sync both per-device queues so the pool_alloc destructors don't
|
||||
// return memory still in use by the last kernel.
|
||||
if (comm_ctx->backends.size() == 2) {
|
||||
auto * sctx0 = (ggml_backend_sycl_context *) comm_ctx->backends[0]->context;
|
||||
auto * sctx1 = (ggml_backend_sycl_context *) comm_ctx->backends[1]->context;
|
||||
try {
|
||||
sctx0->stream()->wait();
|
||||
sctx1->stream()->wait();
|
||||
} catch (...) { /* best effort during shutdown */ }
|
||||
}
|
||||
|
||||
delete comm_ctx;
|
||||
}
|
||||
|
||||
bool ggml_backend_sycl_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) try {
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto * comm_ctx = static_cast<ggml_backend_sycl_comm_context *>(comm_ctx_v);
|
||||
const size_t n_backends = comm_ctx->backends.size();
|
||||
|
||||
// Fast path: N=2, F32/F16, contiguous, matching shapes.
|
||||
if (n_backends != 2) {
|
||||
return false;
|
||||
}
|
||||
// Accept F32 or F16 inputs natively (types must match). F16 takes the
|
||||
// direct 2-byte memcpy + add path below; other types return false so the
|
||||
// meta-backend uses its generic all-reduce.
|
||||
if (tensors[0]->type != tensors[1]->type) {
|
||||
return false;
|
||||
}
|
||||
if (tensors[0]->type != GGML_TYPE_F32 && tensors[0]->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
if (!ggml_is_contiguous(tensors[0]) || !ggml_is_contiguous(tensors[1])) {
|
||||
return false;
|
||||
}
|
||||
if (ggml_nelements(tensors[0]) != ggml_nelements(tensors[1])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int64_t nelem = ggml_nelements(tensors[0]);
|
||||
const size_t nbytes = ggml_nbytes(tensors[0]);
|
||||
if (nelem == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto * ctx0 = (ggml_backend_sycl_context *) comm_ctx->backends[0]->context;
|
||||
auto * ctx1 = (ggml_backend_sycl_context *) comm_ctx->backends[1]->context;
|
||||
queue_ptr q0 = ctx0->stream();
|
||||
queue_ptr q1 = ctx1->stream();
|
||||
|
||||
// Grow per-device byte buffers if needed (4 * nelem bytes each).
|
||||
if (comm_ctx->buf_nelem < nelem) {
|
||||
comm_ctx->buf0->realloc(nelem * 4);
|
||||
comm_ctx->buf1->realloc(nelem * 4);
|
||||
comm_ctx->buf_nelem = nelem;
|
||||
}
|
||||
uint8_t * buf0 = comm_ctx->buf0->get();
|
||||
uint8_t * buf1 = comm_ctx->buf1->get();
|
||||
|
||||
// F16 native path: direct 2-byte cross-device copy + add, skipping the
|
||||
// F32 round-trip the meta-backend fallback would force. Cross-device copies
|
||||
// go through dev2dev_memcpy because the two devices are in separate SYCL
|
||||
// contexts (a raw peer-USM q->memcpy would be a silent no-op).
|
||||
if (tensors[0]->type == GGML_TYPE_F16) {
|
||||
sycl::half * f16_out0 = (sycl::half *) tensors[0]->data;
|
||||
sycl::half * f16_out1 = (sycl::half *) tensors[1]->data;
|
||||
sycl::half * f16_tmp0 = (sycl::half *) buf0;
|
||||
sycl::half * f16_tmp1 = (sycl::half *) buf1;
|
||||
|
||||
q0->wait();
|
||||
q1->wait();
|
||||
dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, f16_tmp0, tensors[1]->data, nbytes);
|
||||
dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, f16_tmp1, tensors[0]->data, nbytes);
|
||||
|
||||
q0->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
f16_out0[i] = (sycl::half) ((float) f16_out0[i] + (float) f16_tmp0[i]);
|
||||
});
|
||||
});
|
||||
q1->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
f16_out1[i] = (sycl::half) ((float) f16_out1[i] + (float) f16_tmp1[i]);
|
||||
});
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
float * out0 = (float *) tensors[0]->data;
|
||||
float * out1 = (float *) tensors[1]->data;
|
||||
|
||||
// BF16 threshold: above this, the PCIe savings from halving the
|
||||
// cross-device bytes outweigh the 2 extra compress kernels.
|
||||
// Below: stay on the FP32 fast path. Threshold mirrors the CUDA
|
||||
// NCCL allreduce pattern for n_backends=2.
|
||||
static constexpr int64_t BF16_THRESHOLD = 32768;
|
||||
|
||||
if (nelem < BF16_THRESHOLD) {
|
||||
// FP32 small path: 4 SYCL submissions per allreduce.
|
||||
float * tmp0 = (float *) buf0;
|
||||
float * tmp1 = (float *) buf1;
|
||||
|
||||
// COMM-D2D-FIX: the two devices are in SEPARATE SYCL contexts, so a raw
|
||||
// q->memcpy of a peer USM pointer is a silent no-op. Route cross-device
|
||||
// copies through dev2dev_memcpy (L0 direct copy / host staging). It is
|
||||
// synchronous, so wait for the local partials to be produced first.
|
||||
q0->wait();
|
||||
q1->wait();
|
||||
dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, tmp0, tensors[1]->data, nbytes);
|
||||
dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, tmp1, tensors[0]->data, nbytes);
|
||||
|
||||
q0->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
out0[i] += tmp0[i];
|
||||
});
|
||||
});
|
||||
q1->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
out1[i] += tmp1[i];
|
||||
});
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
// BF16 large path: 6 SYCL submissions per allreduce, but the
|
||||
// cross-device memcpy is HALF the bytes. Pure bit-shift
|
||||
// conversion (no rounding) — matches ggml's truncating fp32->bf16.
|
||||
uint16_t * outbox0 = (uint16_t *) buf0;
|
||||
uint16_t * inbox0 = outbox0 + nelem;
|
||||
uint16_t * outbox1 = (uint16_t *) buf1;
|
||||
uint16_t * inbox1 = outbox1 + nelem;
|
||||
|
||||
// Phase A: compress each device's local partial in parallel.
|
||||
sycl::event c0 = q0->parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
outbox0[i] = (uint16_t) (sycl::bit_cast<uint32_t>(out0[i]) >> 16);
|
||||
});
|
||||
|
||||
sycl::event c1 = q1->parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
outbox1[i] = (uint16_t) (sycl::bit_cast<uint32_t>(out1[i]) >> 16);
|
||||
});
|
||||
|
||||
// Phase B: COMM-D2D-FIX-BF16 cross-device copy of compressed bytes via
|
||||
// dev2dev_memcpy (separate SYCL contexts; sync copy after compress).
|
||||
const size_t bf16_bytes = nelem * sizeof(uint16_t);
|
||||
c0.wait();
|
||||
c1.wait();
|
||||
dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, inbox0, outbox1, bf16_bytes);
|
||||
dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, inbox1, outbox0, bf16_bytes);
|
||||
|
||||
// Phase C: decompress + add into local FP32 partial.
|
||||
q0->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
out0[i] += sycl::bit_cast<float>(((uint32_t) inbox0[i]) << 16);
|
||||
});
|
||||
});
|
||||
|
||||
q1->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
out1[i] += sycl::bit_cast<float>(((uint32_t) inbox1[i]) << 16);
|
||||
});
|
||||
});
|
||||
|
||||
return true;
|
||||
}
|
||||
catch (const sycl::exception &) { return false; }
|
||||
catch (...) { return false; }
|
||||
|
||||
static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
|
||||
GGML_UNUSED(reg);
|
||||
|
||||
@@ -5866,6 +6110,17 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons
|
||||
return (void *)ggml_backend_sycl_split_buffer_type;
|
||||
}
|
||||
|
||||
// Tensor parallelism (--split-mode tensor) entry points.
|
||||
if (strcmp(name, "ggml_backend_comm_init") == 0) {
|
||||
return (void *)ggml_backend_sycl_comm_init;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_comm_free") == 0) {
|
||||
return (void *)ggml_backend_sycl_comm_free;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_comm_allreduce_tensor") == 0) {
|
||||
return (void *)ggml_backend_sycl_comm_allreduce_tensor;
|
||||
}
|
||||
|
||||
// SYCL doesn't support registering host memory, left here for reference
|
||||
// "ggml_backend_register_host_buffer"
|
||||
// "ggml_backend_unregister_host_buffer"
|
||||
|
||||
@@ -699,6 +699,7 @@ struct vk_device_struct {
|
||||
|
||||
bool add_rms_fusion;
|
||||
uint32_t partials_binding_alignment;
|
||||
uint32_t max_nodes_per_submit;
|
||||
|
||||
bool shader_64b_indexing;
|
||||
|
||||
@@ -5878,6 +5879,14 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device->subgroup_vote = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eVote);
|
||||
|
||||
// Submit at least every 100 nodes, in case there are workloads without as much matmul.
|
||||
device->max_nodes_per_submit = 100;
|
||||
const char* GGML_VK_MAX_NODES_PER_SUBMIT = getenv("GGML_VK_MAX_NODES_PER_SUBMIT");
|
||||
if (GGML_VK_MAX_NODES_PER_SUBMIT != nullptr) {
|
||||
uint32_t max_nodes_per_submit = std::stoul(GGML_VK_MAX_NODES_PER_SUBMIT);
|
||||
device->max_nodes_per_submit = std::max(max_nodes_per_submit, 1u);
|
||||
}
|
||||
|
||||
const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr;
|
||||
|
||||
device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
|
||||
@@ -16173,8 +16182,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
|
||||
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
|
||||
// (and scaled down based on model size, so smaller models submit earlier).
|
||||
// Also submit at least every 100 nodes, in case there are workloads without as much matmul.
|
||||
int nodes_per_submit = 100;
|
||||
int submitted_nodes = 0;
|
||||
int submit_count = 0;
|
||||
uint64_t mul_mat_bytes = 0;
|
||||
@@ -16400,7 +16407,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
|
||||
// Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
|
||||
bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
|
||||
bool submit = (submitted_nodes >= nodes_per_submit) ||
|
||||
bool submit = ((uint32_t)submitted_nodes >= ctx->device->max_nodes_per_submit) ||
|
||||
(mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) ||
|
||||
(i + ctx->num_additional_fused_ops >= last_node) ||
|
||||
(almost_ready && !ctx->almost_ready_fence_pending);
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <future>
|
||||
#include <queue>
|
||||
#include <condition_variable>
|
||||
#include <atomic>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
@@ -34,6 +35,9 @@
|
||||
|
||||
std::mutex lock;
|
||||
std::vector<std::pair<std::string, std::string>> shader_fnames;
|
||||
// Set when any shader subprocess fails (non-zero exit / stderr / launch failure) so the
|
||||
// build is stopped instead of silently producing a broken libggml-vulkan. (issue #24393)
|
||||
static std::atomic<bool> compile_failed{false};
|
||||
std::locale c_locale("C");
|
||||
|
||||
std::string GLSLC = "glslc";
|
||||
@@ -78,7 +82,7 @@ enum MatMulIdType {
|
||||
|
||||
namespace {
|
||||
|
||||
void execute_command(std::vector<std::string>& command, std::string& stdout_str, std::string& stderr_str) {
|
||||
int execute_command(std::vector<std::string>& command, std::string& stdout_str, std::string& stderr_str) {
|
||||
#ifdef _WIN32
|
||||
HANDLE stdout_read, stdout_write;
|
||||
HANDLE stderr_read, stderr_write;
|
||||
@@ -127,8 +131,11 @@ void execute_command(std::vector<std::string>& command, std::string& stdout_str,
|
||||
CloseHandle(stdout_read);
|
||||
CloseHandle(stderr_read);
|
||||
WaitForSingleObject(pi.hProcess, INFINITE);
|
||||
DWORD exit_code = 1;
|
||||
GetExitCodeProcess(pi.hProcess, &exit_code);
|
||||
CloseHandle(pi.hProcess);
|
||||
CloseHandle(pi.hThread);
|
||||
return (int)exit_code;
|
||||
#else
|
||||
int stdout_pipe[2];
|
||||
int stderr_pipe[2];
|
||||
@@ -175,7 +182,9 @@ void execute_command(std::vector<std::string>& command, std::string& stdout_str,
|
||||
|
||||
close(stdout_pipe[0]);
|
||||
close(stderr_pipe[0]);
|
||||
waitpid(pid, nullptr, 0);
|
||||
int status = 0;
|
||||
waitpid(pid, &status, 0);
|
||||
return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -372,13 +381,14 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p
|
||||
// }
|
||||
// std::cout << std::endl;
|
||||
|
||||
execute_command(cmd, stdout_str, stderr_str);
|
||||
if (!stderr_str.empty()) {
|
||||
std::cerr << "cannot compile " << name << "\n\n";
|
||||
int exit_code = execute_command(cmd, stdout_str, stderr_str);
|
||||
if (exit_code != 0 || !stderr_str.empty()) {
|
||||
std::cerr << "cannot compile " << name << " (exit code " << exit_code << ")\n\n";
|
||||
for (const auto& part : cmd) {
|
||||
std::cerr << part << " ";
|
||||
}
|
||||
std::cerr << "\n\n" << stderr_str << std::endl;
|
||||
compile_failed = true;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -398,6 +408,7 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p
|
||||
shader_fnames.push_back(std::make_pair(name, out_path));
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
|
||||
compile_failed = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1271,6 +1282,11 @@ int main(int argc, char** argv) {
|
||||
|
||||
process_shaders();
|
||||
|
||||
if (compile_failed) {
|
||||
std::cerr << "vulkan-shaders-gen: one or more shaders failed to compile" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
write_output_files();
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
|
||||
@@ -57,19 +57,25 @@ oppoll=
|
||||
opflt=
|
||||
[ "$OF" != "" ] && opflt="GGML_HEXAGON_OPFILTER=$OF"
|
||||
|
||||
opfuse=
|
||||
[ "$OC" != "" ] && opfuse="GGML_HEXAGON_OPFUSION=$OC"
|
||||
|
||||
vmem=
|
||||
[ "$VM" != "" ] && vmem="GGML_HEXAGON_VMEM=$VM"
|
||||
|
||||
mbuf=
|
||||
[ "$MB" != "" ] && mbuf="GGML_HEXAGON_MBUF=$MB"
|
||||
|
||||
mmsel=
|
||||
[ "$MM" != "" ] && mmsel="GGML_HEXAGON_MM_SELECT=$MM"
|
||||
|
||||
set -x
|
||||
|
||||
adb $adbserial $adbhost shell " \
|
||||
cd $basedir; ulimit -c unlimited; \
|
||||
LD_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opflt $vmem $mbuf \
|
||||
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opflt $opfuse $vmem $mbuf $mmsel \
|
||||
./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
|
||||
--ctx-size 8192 --ubatch-size 1024 -fa on \
|
||||
|
||||
@@ -51,6 +51,12 @@ opqueue=
|
||||
oppoll=
|
||||
[ "$OP" != "" ] && oppoll="GGML_HEXAGON_OPPOLL=$OP"
|
||||
|
||||
opfuse=
|
||||
[ "$OC" != "" ] && opfuse="GGML_HEXAGON_OPFUSION=$OC"
|
||||
|
||||
mmsel=
|
||||
[ "$MM" != "" ] && mmsel="GGML_HEXAGON_MM_SELECT=$MM"
|
||||
|
||||
set -x
|
||||
|
||||
tool=$1; shift
|
||||
@@ -59,5 +65,5 @@ adb $adbserial $adbhost shell " \
|
||||
cd $basedir; ulimit -c unlimited; \
|
||||
LD_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll ./$branch/bin/$tool $@ \
|
||||
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opfuse $mmsel ./$branch/bin/$tool $@ \
|
||||
"
|
||||
|
||||
@@ -26,7 +26,7 @@ COL_MAP = {
|
||||
}
|
||||
|
||||
op_pattern = re.compile(
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+:\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
|
||||
)
|
||||
|
||||
trace_pattern = re.compile(
|
||||
@@ -93,9 +93,40 @@ def parse_log(file_path, pmu_index=None):
|
||||
+ int(ts_match.group('us'))
|
||||
)
|
||||
|
||||
op_match = op_pattern.search(line)
|
||||
if "|" in line and "profile-op" in line:
|
||||
parts = [p.strip() for p in line.split("|")]
|
||||
prefix = parts[0]
|
||||
prefix_match = re.search(r"profile-op\s+(?P<op_name>[A-Z_0-9+]+)", prefix)
|
||||
if not prefix_match:
|
||||
continue
|
||||
|
||||
if len(parts) == 7:
|
||||
dims, types, timings = parts[2], parts[3], parts[6]
|
||||
elif len(parts) == 6:
|
||||
dims, types, timings = parts[2], parts[3], parts[5]
|
||||
else:
|
||||
continue
|
||||
|
||||
timing_match = re.search(
|
||||
r"(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?",
|
||||
timings
|
||||
)
|
||||
if not timing_match:
|
||||
continue
|
||||
|
||||
op_match = timing_match
|
||||
op_name = prefix_match.group("op_name")
|
||||
else:
|
||||
op_match = op_pattern.search(line)
|
||||
if op_match:
|
||||
op_name = op_match.group('op_name')
|
||||
dims = op_match.group('dims').strip()
|
||||
types = op_match.group('types').strip()
|
||||
else:
|
||||
op_match = None
|
||||
|
||||
if op_match:
|
||||
pmu_raw = op_match.group('pmu')
|
||||
pmu_raw = op_match.group('pmu') if 'pmu' in op_match.groupdict() else None
|
||||
pmu_val = None
|
||||
if pmu_raw and pmu_index is not None:
|
||||
try:
|
||||
@@ -105,7 +136,7 @@ def parse_log(file_path, pmu_index=None):
|
||||
except (ValueError, IndexError):
|
||||
pmu_val = None
|
||||
|
||||
evt_raw = op_match.group('evt')
|
||||
evt_raw = op_match.group('evt') if 'evt' in op_match.groupdict() else None
|
||||
evt_val = None
|
||||
if evt_raw:
|
||||
try:
|
||||
@@ -122,9 +153,9 @@ def parse_log(file_path, pmu_index=None):
|
||||
op_text = line[idx + 11:].strip() if idx != -1 else line.strip()
|
||||
|
||||
current_op = {
|
||||
'name': op_match.group('op_name'),
|
||||
'dims': op_match.group('dims').strip(),
|
||||
'types': op_match.group('types').strip(),
|
||||
'name': op_name,
|
||||
'dims': dims,
|
||||
'types': types,
|
||||
'op_text': op_text,
|
||||
'usec': int(op_match.group('usec')),
|
||||
'cycles': int(op_match.group('cycles')),
|
||||
|
||||
@@ -12,7 +12,7 @@ from collections import defaultdict
|
||||
logger = logging.getLogger("ggml-hexagon-trace")
|
||||
|
||||
op_pattern = re.compile(
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+(?P<strides>[\d:x\s\->!]+)\s+:\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+(?P<strides>[\d:x\s\->!]+?)\s+:\s+(?:(?P<params>.*?)\s+:\s+)?(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
|
||||
)
|
||||
|
||||
trace_pattern = re.compile(
|
||||
@@ -66,7 +66,40 @@ def parse_log(file_path):
|
||||
|
||||
for line in f:
|
||||
line_idx += 1
|
||||
op_match = op_pattern.search(line)
|
||||
if "|" in line and "profile-op" in line:
|
||||
parts = [p.strip() for p in line.split("|")]
|
||||
prefix = parts[0]
|
||||
prefix_match = re.search(r"profile-op\s+(?P<op_name>[A-Z_0-9+]+)", prefix)
|
||||
if not prefix_match:
|
||||
continue
|
||||
|
||||
if len(parts) == 7:
|
||||
dims, types, strides, params, timings = parts[2], parts[3], parts[4], parts[5], parts[6]
|
||||
elif len(parts) == 6:
|
||||
dims, types, strides, params, timings = parts[2], parts[3], parts[4], "", parts[5]
|
||||
else:
|
||||
continue
|
||||
|
||||
timing_match = re.search(
|
||||
r"(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?",
|
||||
timings
|
||||
)
|
||||
if not timing_match:
|
||||
continue
|
||||
|
||||
op_match = timing_match
|
||||
op_name = prefix_match.group("op_name")
|
||||
else:
|
||||
op_match = op_pattern.search(line)
|
||||
if op_match:
|
||||
op_name = op_match.group('op_name')
|
||||
dims = op_match.group('dims').strip() if op_match.group('dims') else ''
|
||||
types = op_match.group('types').strip() if op_match.group('types') else ''
|
||||
strides = op_match.group('strides').strip() if op_match.group('strides') else ''
|
||||
params = op_match.group('params').strip() if ('params' in op_match.groupdict() and op_match.group('params')) else ''
|
||||
else:
|
||||
op_match = None
|
||||
|
||||
if op_match:
|
||||
cycles_start_raw = op_match.group('start')
|
||||
unwrapped_cycles_start = None
|
||||
@@ -77,10 +110,11 @@ def parse_log(file_path):
|
||||
op_text = line[idx + 11:].strip() if idx != -1 else line.strip()
|
||||
|
||||
current_op = {
|
||||
'name': op_match.group('op_name'),
|
||||
'dims': op_match.group('dims').strip() if op_match.group('dims') else '',
|
||||
'types': op_match.group('types').strip() if op_match.group('types') else '',
|
||||
'strides': op_match.group('strides').strip() if op_match.group('strides') else '',
|
||||
'name': op_name,
|
||||
'dims': dims,
|
||||
'types': types,
|
||||
'strides': strides,
|
||||
'params': params,
|
||||
'op_text': op_text,
|
||||
'usec': int(op_match.group('usec')),
|
||||
'cycles': int(op_match.group('cycles')),
|
||||
@@ -397,6 +431,8 @@ def generate_perfetto_trace(filtered_ops, output_path):
|
||||
debug_annots.append(make_debug_annotation("line", int_val=op['line_num']))
|
||||
if 'strides' in op and op['strides']:
|
||||
debug_annots.append(make_debug_annotation("strides", string_val=op['strides']))
|
||||
if 'params' in op and op['params'] and op['params'] != '----':
|
||||
debug_annots.append(make_debug_annotation("params", string_val=op['params']))
|
||||
|
||||
# Slice Begin
|
||||
evt_begin = make_track_event(1, 2, name=f"{op['name']} ({op['dims']})", category="operator", debug_annotations=debug_annots)
|
||||
|
||||
@@ -700,6 +700,7 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_160M: return "160M";
|
||||
case LLM_TYPE_190M: return "190M";
|
||||
case LLM_TYPE_220M: return "220M";
|
||||
case LLM_TYPE_230M: return "230M";
|
||||
case LLM_TYPE_250M: return "250M";
|
||||
case LLM_TYPE_256M: return "256M";
|
||||
case LLM_TYPE_270M: return "270M";
|
||||
|
||||
@@ -36,6 +36,7 @@ enum llm_type {
|
||||
LLM_TYPE_160M,
|
||||
LLM_TYPE_190M,
|
||||
LLM_TYPE_220M,
|
||||
LLM_TYPE_230M,
|
||||
LLM_TYPE_250M,
|
||||
LLM_TYPE_256M,
|
||||
LLM_TYPE_270M,
|
||||
|
||||
+1
-1
@@ -847,7 +847,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<t
|
||||
qs.has_tied_embeddings = false;
|
||||
}
|
||||
}
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer();
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer_all;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
@@ -13,6 +13,7 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.n_layer_dense_lead = hparams.n_layer();
|
||||
|
||||
switch (hparams.n_ff()) {
|
||||
case 2560: type = LLM_TYPE_230M; break;
|
||||
case 4608: type = LLM_TYPE_350M; break;
|
||||
case 6912: type = LLM_TYPE_700M; break;
|
||||
case 8192: type = LLM_TYPE_1_2B; break;
|
||||
|
||||
@@ -199,7 +199,6 @@ llama_build_and_test(test-jinja.cpp)
|
||||
llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python)
|
||||
llama_build_and_test(test-chat-auto-parser.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
|
||||
llama_build_and_test(test-chat-template.cpp)
|
||||
llama_build_and_test(test-json-partial.cpp)
|
||||
llama_build_and_test(test-log.cpp)
|
||||
llama_build_and_test(
|
||||
test-peg-parser.cpp
|
||||
|
||||
@@ -8176,6 +8176,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2097121, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 524281, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
|
||||
|
||||
// CPY - different src/dst shapes (reshaping via CPY)
|
||||
// Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360.
|
||||
@@ -8420,6 +8422,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q8_0, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_MXFP4, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1}));
|
||||
|
||||
|
||||
#if 0
|
||||
{
|
||||
// Test paths in OpenCL
|
||||
@@ -8594,6 +8601,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
|
||||
// gpt-oss issue with Vulkan mmq_id
|
||||
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_MXFP4, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
|
||||
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_Q4_0, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
|
||||
|
||||
for (ggml_type type_a : all_types) {
|
||||
test_cases.emplace_back(new test_mul_mat_id(type_a, GGML_TYPE_F32, 4, 2, false, 64, 16, 3*ggml_blck_size(type_a)));
|
||||
|
||||
@@ -1,287 +0,0 @@
|
||||
#include "common.h"
|
||||
#include "json-partial.h"
|
||||
#include <exception>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
|
||||
template <class T> static void assert_equals(const T & expected, const T & actual) {
|
||||
if (expected != actual) {
|
||||
std::cerr << "Expected: " << expected << std::endl;
|
||||
std::cerr << "Actual: " << actual << std::endl;
|
||||
std::cerr << std::flush;
|
||||
throw std::runtime_error("Test failed");
|
||||
}
|
||||
}
|
||||
|
||||
static void test_json_healing() {
|
||||
auto parse = [](const std::string & str) {
|
||||
std::cerr << "# Parsing: " << str << '\n';
|
||||
std::string::const_iterator it = str.begin();
|
||||
const auto end = str.end();
|
||||
common_json out;
|
||||
std::string healing_marker = "$llama.cpp.json$";
|
||||
if (common_json_parse(it, end, healing_marker, out)) {
|
||||
auto dump = out.json.dump();
|
||||
std::cerr << "Parsed: " << dump << '\n';
|
||||
std::cerr << "Magic: " << out.healing_marker.json_dump_marker << '\n';
|
||||
std::string result;
|
||||
if (!out.healing_marker.json_dump_marker.empty()) {
|
||||
auto i = dump.find(out.healing_marker.json_dump_marker);
|
||||
if (i == std::string::npos) {
|
||||
throw std::runtime_error("Failed to find magic in dump " + dump + " (magic: " + out.healing_marker.json_dump_marker + ")");
|
||||
}
|
||||
result = dump.substr(0, i);
|
||||
} else {
|
||||
result = dump;
|
||||
}
|
||||
std::cerr << "Result: " << result << '\n';
|
||||
if (string_starts_with(str, result)) {
|
||||
std::cerr << "Failure!\n";
|
||||
}
|
||||
// return dump;
|
||||
} else {
|
||||
throw std::runtime_error("Failed to parse: " + str);
|
||||
}
|
||||
|
||||
};
|
||||
auto parse_all = [&](const std::string & str) {
|
||||
for (size_t i = 1; i < str.size(); i++) {
|
||||
parse(str.substr(0, i));
|
||||
}
|
||||
};
|
||||
parse_all("{\"a\": \"b\"}");
|
||||
parse_all("{\"hey\": 1, \"ho\\\"ha\": [1]}");
|
||||
|
||||
parse_all("[{\"a\": \"b\"}]");
|
||||
|
||||
auto test = [&](const std::vector<std::string> & inputs, const std::string & expected, const std::string & expected_marker) {
|
||||
for (const auto & input : inputs) {
|
||||
common_json out;
|
||||
assert_equals(true, common_json_parse(input, "$foo", out));
|
||||
assert_equals<std::string>(expected, out.json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true));
|
||||
assert_equals<std::string>(expected_marker, out.healing_marker.json_dump_marker);
|
||||
}
|
||||
};
|
||||
// No healing needed:
|
||||
test(
|
||||
{
|
||||
R"([{"a":"b"}, "y"])",
|
||||
},
|
||||
R"([{"a":"b"},"y"])",
|
||||
""
|
||||
);
|
||||
// Partial literals can't be healed:
|
||||
test(
|
||||
{
|
||||
R"([1)",
|
||||
R"([tru)",
|
||||
R"([n)",
|
||||
R"([nul)",
|
||||
R"([23.2)",
|
||||
},
|
||||
R"(["$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a": 1)",
|
||||
R"({"a": tru)",
|
||||
R"({"a": n)",
|
||||
R"({"a": nul)",
|
||||
R"({"a": 23.2)",
|
||||
},
|
||||
R"({"a":"$foo"})",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({)",
|
||||
},
|
||||
R"({"$foo":1})",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([)",
|
||||
},
|
||||
R"(["$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
// Healing right after a full literal
|
||||
test(
|
||||
{
|
||||
R"(1 )",
|
||||
},
|
||||
R"(1)",
|
||||
""
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"(true)",
|
||||
R"(true )",
|
||||
},
|
||||
R"(true)",
|
||||
""
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"(null)",
|
||||
R"(null )",
|
||||
},
|
||||
R"(null)",
|
||||
""
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([1 )",
|
||||
},
|
||||
R"([1,"$foo"])",
|
||||
R"(,"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([{})",
|
||||
R"([{} )",
|
||||
},
|
||||
R"([{},"$foo"])",
|
||||
R"(,"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([true)",
|
||||
},
|
||||
// TODO: detect the true/false/null literal was complete
|
||||
R"(["$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([true )",
|
||||
},
|
||||
R"([true,"$foo"])",
|
||||
R"(,"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([true,)",
|
||||
},
|
||||
R"([true,"$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
// Test nesting
|
||||
test(
|
||||
{
|
||||
R"([{"a": [{"b": [{)",
|
||||
},
|
||||
R"([{"a":[{"b":[{"$foo":1}]}]}])",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([{"a": [{"b": [)",
|
||||
},
|
||||
R"([{"a":[{"b":["$foo"]}]}])",
|
||||
R"("$foo)"
|
||||
);
|
||||
|
||||
test(
|
||||
{
|
||||
R"([{"a": "b"})",
|
||||
R"([{"a": "b"} )",
|
||||
},
|
||||
R"([{"a":"b"},"$foo"])",
|
||||
R"(,"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([{"a": "b"},)",
|
||||
R"([{"a": "b"}, )",
|
||||
},
|
||||
R"([{"a":"b"},"$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({ "code)",
|
||||
},
|
||||
R"({"code$foo":1})",
|
||||
R"($foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({ "code\)",
|
||||
},
|
||||
R"({"code\\$foo":1})",
|
||||
R"(\$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({ "code")",
|
||||
},
|
||||
R"({"code":"$foo"})",
|
||||
R"(:"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({ "key")",
|
||||
},
|
||||
R"({"key":"$foo"})",
|
||||
R"(:"$foo)"
|
||||
);
|
||||
// Test unicode escape sequences
|
||||
test(
|
||||
{
|
||||
R"({"a":"\u)",
|
||||
},
|
||||
R"({"a":"\u0000$foo"})",
|
||||
R"(0000$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a":"\u00)",
|
||||
},
|
||||
R"({"a":"\u0000$foo"})",
|
||||
R"(00$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a":"\ud300)",
|
||||
},
|
||||
R"({"a":"\ud300$foo"})",
|
||||
R"($foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a":"\ud800)",
|
||||
},
|
||||
R"({"a":"\ud800\udc00$foo"})",
|
||||
R"(\udc00$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a":"\ud800\)",
|
||||
},
|
||||
R"({"a":"\ud800\udc00$foo"})",
|
||||
R"(udc00$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a":"\ud800\u)",
|
||||
},
|
||||
R"({"a":"\ud800\udc00$foo"})",
|
||||
R"(dc00$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a":"\ud800\udc00)",
|
||||
},
|
||||
R"({"a":"\ud800\udc00$foo"})",
|
||||
R"($foo)"
|
||||
);
|
||||
}
|
||||
|
||||
int main() {
|
||||
test_json_healing();
|
||||
std::cerr << "All tests passed.\n";
|
||||
return 0;
|
||||
}
|
||||
@@ -146,6 +146,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_INF("Model %d/%d, Context %d/%d: %s\n\n", m + 1, num_models, c + 1, num_contexts, result.c_str());
|
||||
|
||||
llama_synchronize(ctx.get());
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1035,25 +1035,23 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
|
||||
if (!params.hf_repo.empty()) {
|
||||
for (size_t i = 0; i < params.hf_repo.size(); i++) {
|
||||
common_params_model model;
|
||||
|
||||
if (params.hf_file.empty() || params.hf_file[i].empty()) {
|
||||
model.hf_repo = params.hf_repo[i];
|
||||
} else {
|
||||
model.hf_repo = params.hf_repo[i];
|
||||
model.hf_file = params.hf_file[i];
|
||||
common_params p;
|
||||
p.hf_token = params.hf_token;
|
||||
p.offline = params.offline;
|
||||
p.model.hf_repo = params.hf_repo[i];
|
||||
if (!params.hf_file.empty() && !params.hf_file[i].empty()) {
|
||||
p.model.hf_file = params.hf_file[i];
|
||||
}
|
||||
|
||||
common_download_opts opts;
|
||||
opts.bearer_token = params.hf_token;
|
||||
opts.offline = params.offline;
|
||||
auto download_result = common_download_model(model, opts);
|
||||
if (download_result.model_path.empty()) {
|
||||
// only the text model file is needed
|
||||
common_models_handler models_handler = common_models_handler_init(p, LLAMA_EXAMPLE_BENCH);
|
||||
common_models_handler_apply(models_handler, p);
|
||||
if (p.model.path.empty()) {
|
||||
fprintf(stderr, "error: failed to download model from HuggingFace\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
params.model.push_back(download_result.model_path);
|
||||
params.model.push_back(p.model.path);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+23
-17
@@ -115,22 +115,28 @@ if (TARGET mtmd)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
||||
add_executable(llama-gemma3-cli deprecation-warning.cpp)
|
||||
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
|
||||
add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
|
||||
# Gate CLI binaries on LLAMA_BUILD_TOOLS so that standalone library-only
|
||||
# builds (LLAMA_BUILD_MTMD=ON with LLAMA_BUILD_TOOLS=OFF — e.g. Apple
|
||||
# XCFramework packaging) skip the executables entirely. LLAMA_BUILD_COMMON
|
||||
# defaults to ON in standalone builds, so we cannot rely on it for gating.
|
||||
if (LLAMA_BUILD_TOOLS)
|
||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
||||
add_executable(llama-gemma3-cli deprecation-warning.cpp)
|
||||
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
|
||||
add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
|
||||
|
||||
set(TARGET llama-mtmd-cli)
|
||||
add_executable (${TARGET} mtmd-cli.cpp)
|
||||
set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
set(TARGET llama-mtmd-cli)
|
||||
add_executable (${TARGET} mtmd-cli.cpp)
|
||||
set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
endif()
|
||||
target_link_libraries (${TARGET} PRIVATE llama-common mtmd Threads::Threads)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
# mtmd-debug tool
|
||||
add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
|
||||
set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug)
|
||||
target_link_libraries(llama-mtmd-debug PRIVATE llama-common mtmd Threads::Threads)
|
||||
target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17)
|
||||
endif()
|
||||
target_link_libraries (${TARGET} PRIVATE llama-common mtmd Threads::Threads)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
# mtmd-debug tool
|
||||
add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
|
||||
set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug)
|
||||
target_link_libraries(llama-mtmd-debug PRIVATE llama-common mtmd Threads::Threads)
|
||||
target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17)
|
||||
|
||||
@@ -9,6 +9,7 @@ its output, and holds them against the HF model's scores.
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import unicodedata
|
||||
@@ -28,6 +29,12 @@ class ModelSpec:
|
||||
mmproj_arg: str
|
||||
model_default: str
|
||||
mmproj_default: str
|
||||
prompt: str = "Free OCR. "
|
||||
n_predict: int = 512
|
||||
n_ctx: int | None = None
|
||||
# Unlimited-OCR's "document parsing" prompt emits <|det|> grounding markup that
|
||||
# the HF reference strips in result.md; drop it before scoring to match.
|
||||
strip_grounding: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -63,6 +70,20 @@ MODELS = {
|
||||
model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf",
|
||||
mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf",
|
||||
),
|
||||
"unlimited": ModelSpec(
|
||||
key="unlimited", label="Unlimited-OCR",
|
||||
model_arg="--llama-model-unlimited", mmproj_arg="--mmproj-unlimited",
|
||||
model_default="gguf_models/baidu/unlimited-ocr-bf16.gguf",
|
||||
mmproj_default="gguf_models/baidu/mmproj-unlimited-ocr-bf16.gguf",
|
||||
# "Free OCR." immediately emits EOS on this checkpoint; the HF reference
|
||||
# (demo/unlimited_ocr_scores.py) uses "document parsing.", which grounds.
|
||||
prompt="document parsing.",
|
||||
# Grounding emits ~3x the tokens of plain OCR, so it needs a larger budget
|
||||
# and context to reach the article body the ground truth covers.
|
||||
n_predict=4096,
|
||||
n_ctx=16384,
|
||||
strip_grounding=True,
|
||||
),
|
||||
}
|
||||
|
||||
CASES = [
|
||||
@@ -82,9 +103,26 @@ CASES = [
|
||||
# is one pixel off and lands at ~0.69 instead.
|
||||
hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,
|
||||
),
|
||||
TestCase(
|
||||
model_key="unlimited", label="single-view scan",
|
||||
image="tools/mtmd/test-1.jpeg",
|
||||
ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
|
||||
# HF reference: Unlimited-OCR scoring (gundam, bf16) on this image/ground-truth.
|
||||
# Decoder runs full MHA, not R-SWA; the band absorbs that gap + bf16 variance.
|
||||
hf_cer=0.1869, hf_chrf=75.23, cer_tol=0.06, chrf_tol=6.0,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
GROUNDING_TAG_RE = re.compile(r"<\|(ref|det)\|>.*?<\|/\1\|>", re.DOTALL)
|
||||
|
||||
|
||||
def strip_grounding(text: str) -> str:
|
||||
"""Drop <|ref|>..<|/ref|> / <|det|>..<|/det|> grounding markup, matching the
|
||||
cleaned result.md the HF reference scores against."""
|
||||
return GROUNDING_TAG_RE.sub("", text)
|
||||
|
||||
|
||||
def arg_dest(flag: str) -> str:
|
||||
return flag.lstrip("-").replace("-", "_")
|
||||
|
||||
@@ -129,19 +167,19 @@ def compute_chrf(expected: str, ocr_out: str) -> float:
|
||||
return CHRF().sentence_score(ocr_out, [expected]).score
|
||||
|
||||
|
||||
def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
|
||||
def run_mtmd_cli(spec: "ModelSpec", model_path, mmproj_path, image_path, bin_path) -> str:
|
||||
"""Run mtmd-cli on the image and return its output."""
|
||||
cmd = [
|
||||
str(bin_path),
|
||||
"-m", str(model_path),
|
||||
"--mmproj", str(mmproj_path),
|
||||
"--image", str(image_path),
|
||||
"-p", "Free OCR. ",
|
||||
"-p", spec.prompt,
|
||||
"--chat-template", "deepseek-ocr",
|
||||
"--temp", "0",
|
||||
"--flash-attn", "off", # match the HF "eager" attention reference
|
||||
"--no-warmup",
|
||||
"-n", "512", # cap loops on hard images (KV would otherwise fill)
|
||||
"-n", str(spec.n_predict), # cap loops on hard images (KV would otherwise fill)
|
||||
# HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY.
|
||||
# Default DRY breakers include "\n", so they are cleared below.
|
||||
"--dry-multiplier", "0.8",
|
||||
@@ -150,6 +188,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
|
||||
"--dry-penalty-last-n", "-1",
|
||||
"--dry-sequence-breaker", "none",
|
||||
]
|
||||
if spec.n_ctx is not None:
|
||||
cmd += ["-c", str(spec.n_ctx)]
|
||||
logger.debug(f" command: {' '.join(cmd)}")
|
||||
|
||||
try:
|
||||
@@ -164,6 +204,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
|
||||
raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")
|
||||
|
||||
output = result.stdout.decode("utf-8", errors="replace").strip()
|
||||
if spec.strip_grounding:
|
||||
output = strip_grounding(output)
|
||||
if not output:
|
||||
raise RuntimeError("llama-mtmd-cli produced no output on stdout")
|
||||
logger.info(f" output: {len(output)} chars")
|
||||
@@ -193,7 +235,7 @@ def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool:
|
||||
|
||||
logger.info("")
|
||||
logger.info("=" * 60)
|
||||
logger.info("Free OCR evaluation:")
|
||||
logger.info("OCR evaluation:")
|
||||
logger.info("=" * 60)
|
||||
logger.info(f" CER {cer:>7.4f} (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f} -> {verdict(cer_pass)})")
|
||||
logger.info(f" chrF (0-100) {chrf:>7.2f} (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f} -> {verdict(chrf_pass)})")
|
||||
@@ -269,9 +311,9 @@ def main() -> int:
|
||||
expected = read_expected_text(ground_truth)
|
||||
logger.info(f" Image: {case.image}")
|
||||
logger.info(f" Expected text: {len(expected)} chars")
|
||||
logger.info(" Running llama.cpp 'Free OCR'")
|
||||
logger.info(f" Running llama.cpp prompt {model_spec.prompt!r}")
|
||||
try:
|
||||
ocr_out = run_mtmd_cli(model, mmproj, image, binary)
|
||||
ocr_out = run_mtmd_cli(model_spec, model, mmproj, image, binary)
|
||||
except RuntimeError as e:
|
||||
logger.error(f" Error: {e}")
|
||||
results[title] = False
|
||||
|
||||
@@ -40,6 +40,7 @@ struct debug_options {
|
||||
bool enable_reasoning = true;
|
||||
bool debug_jinja = false;
|
||||
bool force_tool_call = false;
|
||||
bool parallel_tool_calls = true;
|
||||
output_mode mode = output_mode::BOTH;
|
||||
input_message_type input_message = input_message_type::NONE;
|
||||
};
|
||||
@@ -87,6 +88,7 @@ static void print_usage(const char * program_name) {
|
||||
LOG_ERR("\nOptions:\n");
|
||||
LOG_ERR(" --no-tools Disable tool definitions\n");
|
||||
LOG_ERR(" --force-tool-call Set tool calls to forced\n");
|
||||
LOG_ERR(" --parallel-tool-calls=0|1 Set parallel_tool_calls (default: 1)\n");
|
||||
LOG_ERR(" --generation-prompt=0|1 Set add_generation_prompt (default: 1)\n");
|
||||
LOG_ERR(" --enable-reasoning=0|1 Enable reasoning parsing (default: 1)\n");
|
||||
LOG_ERR(" --output=MODE Output mode: analysis, template, both (default: both)\n");
|
||||
@@ -121,6 +123,8 @@ static bool parse_options(int argc, char ** argv, debug_options & opts) {
|
||||
opts.debug_jinja = true;
|
||||
} else if (arg == "--no-tools") {
|
||||
opts.with_tools = false;
|
||||
} else if (arg.rfind("--parallel-tool-calls=", 0) == 0) {
|
||||
opts.parallel_tool_calls = parse_bool_option(arg.substr(22));
|
||||
} else if (arg.rfind("--generation-prompt=", 0) == 0) {
|
||||
opts.generation_prompt = parse_bool_option(arg.substr(20));
|
||||
} else if (arg.rfind("--enable-reasoning=", 0) == 0) {
|
||||
@@ -349,7 +353,7 @@ static autoparser::generation_params prepare_params(const debug_options & opts,
|
||||
params.tools = json();
|
||||
params.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
}
|
||||
params.parallel_tool_calls = false;
|
||||
params.parallel_tool_calls = opts.parallel_tool_calls;
|
||||
return params;
|
||||
}
|
||||
|
||||
|
||||
@@ -223,8 +223,8 @@ void server_model_meta::update_caps() {
|
||||
"LLAMA_ARG_HF_REPO_FILE",
|
||||
});
|
||||
params.offline = true;
|
||||
// params.skip_download = true; // TODO: ideally, we should validate the model here, but it takes too much time
|
||||
common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {});
|
||||
common_models_handler handler = common_models_handler_init(params, LLAMA_EXAMPLE_SERVER);
|
||||
common_models_handler_apply(handler, params); // note: this won't download the model because offline=true
|
||||
if (params.mmproj.path.empty()) {
|
||||
multimodal = { false, false };
|
||||
} else {
|
||||
@@ -1393,9 +1393,8 @@ struct server_download_state : public common_download_callback {
|
||||
|
||||
bool run(common_params & params) {
|
||||
try {
|
||||
common_params_handle_models_params p;
|
||||
p.callback = this;
|
||||
common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, p);
|
||||
common_models_handler handler = common_models_handler_init(params, LLAMA_EXAMPLE_SERVER);
|
||||
common_models_handler_apply(handler, params, this);
|
||||
is_ok = true;
|
||||
} catch (const std::exception & e) {
|
||||
auto model_name = params.model.get_name();
|
||||
@@ -1768,23 +1767,14 @@ void server_models_routes::init_routes() {
|
||||
throw std::invalid_argument("model must be a non-empty string");
|
||||
}
|
||||
|
||||
common_params_model model;
|
||||
common_download_opts opts;
|
||||
common_params p;
|
||||
p.model.hf_repo = name;
|
||||
p.hf_token = params.hf_token;
|
||||
|
||||
model.hf_repo = name;
|
||||
opts.bearer_token = params.hf_token;
|
||||
// note: we only check main model, no need sidecar here
|
||||
opts.download_mmproj = false;
|
||||
opts.download_mtp = false;
|
||||
|
||||
// first, only check if the model is valid and can be downloaded
|
||||
opts.skip_download = true;
|
||||
// validate by fetching metadata
|
||||
bool ok = false;
|
||||
try {
|
||||
auto validation = common_download_model(model, opts);
|
||||
ok = !validation.model_path.empty();
|
||||
} catch (const common_skip_download_exception &) {
|
||||
// model is valid and will be downloaded
|
||||
common_models_handler_init(p, LLAMA_EXAMPLE_SERVER);
|
||||
ok = true;
|
||||
} catch (...) {
|
||||
SRV_ERR("unknown error while validating model '%s'\n", name.c_str());
|
||||
|
||||
+35
-9
@@ -89,15 +89,16 @@ int llama_server(int argc, char ** argv) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// note: router mode also accepts -hf remote-preset, so we need to check that first
|
||||
if (!params.model.hf_repo.empty()) {
|
||||
try {
|
||||
common_params_handle_models_params handle_params;
|
||||
handle_params.preset_only = true;
|
||||
common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, handle_params);
|
||||
} catch (const std::exception & e) {
|
||||
// ignored for now
|
||||
common_models_handler models_handler;
|
||||
try {
|
||||
models_handler = common_models_handler_init(params, LLAMA_EXAMPLE_SERVER);
|
||||
if (common_models_handler_is_preset_repo(models_handler)) {
|
||||
// apply the preset and start the server in router mode
|
||||
common_models_handler_apply(models_handler, params);
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
SRV_ERR("failed to fetch model metadata: %s\n", e.what());
|
||||
return 1;
|
||||
}
|
||||
|
||||
// router server never loads a model and must not touch the GPU
|
||||
@@ -241,6 +242,19 @@ int llama_server(int argc, char ** argv) {
|
||||
// Google Cloud Platform (Vertex AI) compat
|
||||
ctx_http.register_gcp_compat();
|
||||
|
||||
// return 403 for disabled features
|
||||
server_http_context::handler_t res_403 = [](const server_http_req &) {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
res->status = 403;
|
||||
res->data = safe_json_to_str({
|
||||
{"error", {
|
||||
{"message", "this feature is disabled"},
|
||||
{"type", "feature_disabled"},
|
||||
}}
|
||||
});
|
||||
return res;
|
||||
};
|
||||
|
||||
// CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP)
|
||||
if (params.ui_mcp_proxy) {
|
||||
SRV_WRN("%s", "-----------------\n");
|
||||
@@ -249,7 +263,11 @@ int llama_server(int argc, char ** argv) {
|
||||
SRV_WRN("%s", "-----------------\n");
|
||||
ctx_http.get ("/cors-proxy", ex_wrapper(proxy_handler_get));
|
||||
ctx_http.post("/cors-proxy", ex_wrapper(proxy_handler_post));
|
||||
} else {
|
||||
ctx_http.get ("/cors-proxy", ex_wrapper(res_403));
|
||||
ctx_http.post("/cors-proxy", ex_wrapper(res_403));
|
||||
}
|
||||
|
||||
// EXPERIMENTAL built-in tools
|
||||
if (!params.server_tools.empty()) {
|
||||
try {
|
||||
@@ -264,6 +282,9 @@ int llama_server(int argc, char ** argv) {
|
||||
SRV_WRN("%s", "-----------------\n");
|
||||
ctx_http.get ("/tools", ex_wrapper(tools.handle_get));
|
||||
ctx_http.post("/tools", ex_wrapper(tools.handle_post));
|
||||
} else {
|
||||
ctx_http.get ("/tools", ex_wrapper(res_403));
|
||||
ctx_http.post("/tools", ex_wrapper(res_403));
|
||||
}
|
||||
|
||||
//
|
||||
@@ -274,7 +295,12 @@ int llama_server(int argc, char ** argv) {
|
||||
return child.run_download(params);
|
||||
} else if (!is_router_server) {
|
||||
// single-model mode (NOT spawned by router)
|
||||
common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {});
|
||||
try {
|
||||
common_models_handler_apply(models_handler, params);
|
||||
} catch (const std::exception & e) {
|
||||
SRV_ERR("failed to download model: %s\n", e.what());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
@@ -16,7 +16,7 @@ def test_mcp_no_proxy():
|
||||
server.start()
|
||||
|
||||
res = server.make_request("GET", "/cors-proxy")
|
||||
assert res.status_code == 404
|
||||
assert res.status_code == 403
|
||||
|
||||
|
||||
def test_mcp_proxy():
|
||||
|
||||
+1
-2
@@ -28,10 +28,9 @@ vite.config.ts.timestamp-*
|
||||
# PWA Artifacts
|
||||
apple-splash-*.png
|
||||
apple-touch-icon-*.png
|
||||
favicon.ico
|
||||
favicon-dark.ico
|
||||
maskable-icon-*.png
|
||||
pwa-*.png
|
||||
static/favicon*
|
||||
|
||||
# Storybook
|
||||
*storybook.log
|
||||
|
||||
Generated
+7
-7
@@ -35,7 +35,7 @@
|
||||
"bits-ui": "2.18.1",
|
||||
"clsx": "2.1.1",
|
||||
"dexie": "4.4.3",
|
||||
"dompurify": "3.4.5",
|
||||
"dompurify": "3.4.11",
|
||||
"eslint": "9.39.4",
|
||||
"eslint-config-prettier": "10.1.8",
|
||||
"eslint-plugin-storybook": "10.4.2",
|
||||
@@ -8653,9 +8653,9 @@
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/dompurify": {
|
||||
"version": "3.4.5",
|
||||
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.4.5.tgz",
|
||||
"integrity": "sha512-OrwIBKsdNSVEeubdJ1HBv/wNENRM9ytAVCv7YXt//A3vPdVMNuACRqK9mXCGCBW2ln7BT/A4X0jXHo2Gu89miA==",
|
||||
"version": "3.4.11",
|
||||
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.4.11.tgz",
|
||||
"integrity": "sha512-zhlUV12GsaRzMsf9q5M254YhA4+VuF0fG+QFqu6aYpoGlKtz+w8//jBcGVYBgQkR5GHjUomejY84AV+/uPbWdw==",
|
||||
"dev": true,
|
||||
"license": "(MPL-2.0 OR Apache-2.0)",
|
||||
"optionalDependencies": {
|
||||
@@ -10226,9 +10226,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/hono": {
|
||||
"version": "4.12.23",
|
||||
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.23.tgz",
|
||||
"integrity": "sha512-eIaZ9qDgu7XV0pxOCrg7/WhnQ6Ivm22UcxhXx/A3dcbqbbYgBEkc6e/J/s7j2tS96zoB0S9VBdLwQNCWwUo4LA==",
|
||||
"version": "4.12.26",
|
||||
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.26.tgz",
|
||||
"integrity": "sha512-uyZtpnYxM9CmQ7QsQknM4zN8EftNqhON1qYeIKM0Se67CCEe2c44xyGURwB0axX2fBDu1dqHrHAc1hmNT8ITkw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
|
||||
@@ -54,7 +54,7 @@
|
||||
"bits-ui": "2.18.1",
|
||||
"clsx": "2.1.1",
|
||||
"dexie": "4.4.3",
|
||||
"dompurify": "3.4.5",
|
||||
"dompurify": "3.4.11",
|
||||
"eslint": "9.39.4",
|
||||
"eslint-config-prettier": "10.1.8",
|
||||
"eslint-plugin-storybook": "10.4.2",
|
||||
|
||||
@@ -1,4 +1,10 @@
|
||||
import { defineConfig } from '@vite-pwa/assets-generator/config';
|
||||
import { FAVICON_COLORS, PWA_ASSET_GENERATOR } from './src/lib/constants/pwa';
|
||||
import { writeThemeFavicons } from './scripts/favicon-colorize';
|
||||
|
||||
writeThemeFavicons(FAVICON_COLORS.LIGHT, FAVICON_COLORS.DARK, {
|
||||
padding: PWA_ASSET_GENERATOR.FAVICON_PADDING
|
||||
});
|
||||
|
||||
export default defineConfig({
|
||||
headLinkOptions: {
|
||||
@@ -7,7 +13,8 @@ export default defineConfig({
|
||||
preset: {
|
||||
transparent: {
|
||||
sizes: [],
|
||||
favicons: [[48, 'favicon-dark.ico']]
|
||||
favicons: [[48, 'favicon-dark.ico']],
|
||||
padding: PWA_ASSET_GENERATOR.FAVICON_PADDING
|
||||
},
|
||||
maskable: {
|
||||
sizes: []
|
||||
|
||||
@@ -5,15 +5,32 @@ import {
|
||||
} from '@vite-pwa/assets-generator/config';
|
||||
import { readFileSync } from 'node:fs';
|
||||
import { resolve } from 'node:path';
|
||||
import { THEME_COLORS, PWA_GENERATOR_DEVICES, PWA_ASSET_GENERATOR } from './src/lib/constants/pwa';
|
||||
import {
|
||||
THEME_COLORS,
|
||||
PWA_GENERATOR_DEVICES,
|
||||
PWA_ASSET_GENERATOR,
|
||||
FAVICON_COLORS
|
||||
} from './src/lib/constants/pwa';
|
||||
import { SplashOrientation } from './src/lib/enums/splash.enums';
|
||||
import { writeThemeFavicons } from './scripts/favicon-colorize';
|
||||
|
||||
writeThemeFavicons(FAVICON_COLORS.LIGHT, FAVICON_COLORS.DARK, {
|
||||
padding: PWA_ASSET_GENERATOR.FAVICON_PADDING
|
||||
});
|
||||
|
||||
export default defineConfig({
|
||||
headLinkOptions: {
|
||||
preset: PWA_ASSET_GENERATOR.LINK_PRESET
|
||||
},
|
||||
preset: combinePresetAndAppleSplashScreens(
|
||||
minimal2023Preset,
|
||||
{
|
||||
...minimal2023Preset,
|
||||
// tiny margin so favicon.ico / pwa-*.png breathe inside the canvas
|
||||
transparent: {
|
||||
...minimal2023Preset.transparent,
|
||||
padding: PWA_ASSET_GENERATOR.FAVICON_PADDING
|
||||
}
|
||||
},
|
||||
{
|
||||
padding: PWA_ASSET_GENERATOR.SPLASH_PADDING,
|
||||
resizeOptions: {
|
||||
|
||||
@@ -0,0 +1,107 @@
|
||||
import { mkdirSync, readFileSync, writeFileSync } from 'node:fs';
|
||||
import { dirname, resolve } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
|
||||
const HERE = dirname(fileURLToPath(import.meta.url));
|
||||
const PROJECT_ROOT = resolve(HERE, '..');
|
||||
|
||||
const DEFAULT_LOGO = resolve(PROJECT_ROOT, 'src/lib/assets/logo.svg');
|
||||
const DEFAULT_OUT_DIR = resolve(PROJECT_ROOT, 'static');
|
||||
const DEFAULT_OUT_LIGHT = resolve(DEFAULT_OUT_DIR, 'favicon.svg');
|
||||
const DEFAULT_OUT_DARK = resolve(DEFAULT_OUT_DIR, 'favicon-dark.svg');
|
||||
|
||||
const CURRENT_COLOR = 'currentColor';
|
||||
|
||||
export interface ColorizedFavicon {
|
||||
light: string;
|
||||
dark: string;
|
||||
}
|
||||
|
||||
export interface WriteThemeFaviconsOptions {
|
||||
sourcePath?: string;
|
||||
lightOutPath?: string;
|
||||
darkOutPath?: string;
|
||||
/**
|
||||
* Fraction of the icon (0..1) to leave as an even margin on each side.
|
||||
* Applied by wrapping the inner content in a `<g transform="...">` so the
|
||||
* source `src/lib/assets/logo.svg` is not modified. Pass 0 to disable.
|
||||
*/
|
||||
padding?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace every `currentColor` occurrence in the SVG with the given color.
|
||||
* Pure: no filesystem access, so it is straightforward to unit-test.
|
||||
*/
|
||||
export function colorizeFaviconSvg(
|
||||
svg: string,
|
||||
lightColor: string,
|
||||
darkColor: string
|
||||
): ColorizedFavicon {
|
||||
return {
|
||||
light: svg.replaceAll(CURRENT_COLOR, lightColor),
|
||||
dark: svg.replaceAll(CURRENT_COLOR, darkColor)
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Shrink the inner SVG content uniformly and re-center it so `padding` (a
|
||||
* 0..1 fraction) is reserved as equal margin on each side. Returns the input
|
||||
* unchanged for non-positive padding, missing/invalid `viewBox`, or unexpected
|
||||
* markup so the caller always gets a renderable SVG.
|
||||
*/
|
||||
export function padFaviconSvg(svg: string, padding: number): string {
|
||||
if (!(padding > 0) || padding >= 1) return svg;
|
||||
|
||||
const viewBoxMatch = svg.match(/viewBox\s*=\s*["']([^"']+)["']/i);
|
||||
if (!viewBoxMatch) return svg;
|
||||
|
||||
const parts = viewBoxMatch[1]
|
||||
.trim()
|
||||
.split(/[\s,]+/)
|
||||
.map(Number);
|
||||
if (parts.length !== 4 || parts.some((n) => !Number.isFinite(n))) return svg;
|
||||
|
||||
const [, , width, height] = parts;
|
||||
if (width <= 0 || height <= 0) return svg;
|
||||
|
||||
const scale = 1 - padding;
|
||||
const translateX = (padding * width) / 2;
|
||||
const translateY = (padding * height) / 2;
|
||||
|
||||
const openTagStart = svg.search(/<svg\b/i);
|
||||
if (openTagStart === -1) return svg;
|
||||
const openTagEnd = svg.indexOf('>', openTagStart);
|
||||
if (openTagEnd === -1) return svg;
|
||||
const closeStart = svg.lastIndexOf('</svg');
|
||||
if (closeStart === -1 || closeStart <= openTagEnd) return svg;
|
||||
|
||||
const openTag = svg.slice(0, openTagEnd + 1);
|
||||
const inner = svg.slice(openTagEnd + 1, closeStart);
|
||||
const closeTag = svg.slice(closeStart);
|
||||
|
||||
const group = `<g transform="translate(${translateX} ${translateY}) scale(${scale})">`;
|
||||
return `${openTag}${group}${inner}</g>${closeTag}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read `src/lib/assets/logo.svg`, colorize it for both themes, and write
|
||||
* the results to the static directory so the PWA asset generator can consume
|
||||
* them. Paths can be overridden for tests.
|
||||
*/
|
||||
export function writeThemeFavicons(
|
||||
lightColor: string,
|
||||
darkColor: string,
|
||||
{
|
||||
sourcePath = DEFAULT_LOGO,
|
||||
lightOutPath = DEFAULT_OUT_LIGHT,
|
||||
darkOutPath = DEFAULT_OUT_DARK,
|
||||
padding = 0
|
||||
}: WriteThemeFaviconsOptions = {}
|
||||
): void {
|
||||
const source = readFileSync(sourcePath, 'utf-8');
|
||||
const { light, dark } = colorizeFaviconSvg(source, lightColor, darkColor);
|
||||
mkdirSync(dirname(lightOutPath), { recursive: true });
|
||||
writeFileSync(lightOutPath, padFaviconSvg(light, padding));
|
||||
writeFileSync(darkOutPath, padFaviconSvg(dark, padding));
|
||||
}
|
||||
@@ -48,6 +48,7 @@
|
||||
|
||||
--chat-form-area-height: 8rem;
|
||||
--chat-form-area-offset: 2rem;
|
||||
--chat-form-padding-top: 6rem;
|
||||
--max-message-height: max(24rem, min(80dvh, calc(100dvh - var(--chat-form-area-height) - 12rem)));
|
||||
}
|
||||
|
||||
@@ -55,6 +56,7 @@
|
||||
:root {
|
||||
--chat-form-area-height: 24rem;
|
||||
--chat-form-area-offset: 12rem;
|
||||
--chat-form-padding-top: 6rem;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -141,7 +143,6 @@
|
||||
@apply bg-background text-foreground;
|
||||
scrollbar-width: thin;
|
||||
scrollbar-gutter: stable;
|
||||
overflow: hidden; /* Added due to Mermaid rendering somehow causing the double scrollbar */
|
||||
}
|
||||
|
||||
/* Global scrollbar styling - visible only on hover */
|
||||
@@ -193,3 +194,7 @@
|
||||
scrollbar-width: none;
|
||||
}
|
||||
}
|
||||
|
||||
.mermaidTooltip {
|
||||
display: none !important;
|
||||
}
|
||||
|
||||
@@ -10,9 +10,9 @@ import { isElementInViewport } from '$lib/utils/viewport';
|
||||
*/
|
||||
export function fadeInView(
|
||||
node: HTMLElement,
|
||||
options: { duration?: number; y?: number; skipIfVisible?: boolean } = {}
|
||||
options: { duration?: number; y?: number; delay?: number; skipIfVisible?: boolean } = {}
|
||||
) {
|
||||
const { duration = 300, y = 0, skipIfVisible = false } = options;
|
||||
const { duration = 300, y = 0, delay = 0, skipIfVisible = false } = options;
|
||||
|
||||
if (skipIfVisible && isElementInViewport(node)) {
|
||||
return;
|
||||
@@ -27,10 +27,12 @@ export function fadeInView(
|
||||
(entries) => {
|
||||
for (const entry of entries) {
|
||||
if (entry.isIntersecting) {
|
||||
requestAnimationFrame(() => {
|
||||
node.style.opacity = '1';
|
||||
node.style.transform = 'translateY(0)';
|
||||
});
|
||||
setTimeout(() => {
|
||||
requestAnimationFrame(() => {
|
||||
node.style.opacity = '1';
|
||||
node.style.transform = 'translateY(0)';
|
||||
});
|
||||
}, delay);
|
||||
observer.disconnect();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
<svg width="512" height="512" viewBox="0 0 512 512" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<path d="M244.95 8C215.233 8 187.774 23.8591 172.923 49.5999L95.6009 183.625C60.2162 244.959 104.481 321.6 175.29 321.6H208L316.977 132.708C348.959 77.2719 308.95 8 244.95 8ZM208 321.6H351.947C415.982 321.6 456.013 390.91 424.013 446.377C409.155 472.132 381.681 488 351.947 488H271.29C200.481 488 156.216 411.359 191.601 350.026L208 321.6Z" fill="currentColor"/>
|
||||
<path d="M208 321.6H16L106.462 164.8L208 321.6Z" fill="currentColor"/>
|
||||
<path d="M388.923 8L208 321.6L253.6 8H388.923Z" fill="currentColor"/>
|
||||
<path d="M304 488H112L202.462 331.2L304 488Z" fill="currentColor"/>
|
||||
<path d="M496 321.6H208L419.399 454.4L496 321.6Z" fill="currentColor"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 771 B |
@@ -8,12 +8,13 @@
|
||||
ariaLabel?: string;
|
||||
class?: string;
|
||||
disabled?: boolean;
|
||||
href?: string;
|
||||
icon: Component;
|
||||
iconSize?: string;
|
||||
onclick: (e?: MouseEvent) => void;
|
||||
onclick?: (e?: MouseEvent) => void;
|
||||
size?: ButtonSize;
|
||||
stopPropagationOnClick?: boolean;
|
||||
tooltip: string;
|
||||
tooltip?: string;
|
||||
variant?: ButtonVariant;
|
||||
tooltipSide?: TooltipSide;
|
||||
}
|
||||
@@ -22,6 +23,7 @@
|
||||
icon,
|
||||
tooltip,
|
||||
variant = 'ghost',
|
||||
href = '',
|
||||
size = 'sm',
|
||||
class: className = '',
|
||||
disabled = false,
|
||||
@@ -31,34 +33,49 @@
|
||||
onclick,
|
||||
ariaLabel
|
||||
}: Props = $props();
|
||||
|
||||
let innerWidth = $state(0);
|
||||
const showTooltip = $derived(!!tooltip && innerWidth > 768);
|
||||
</script>
|
||||
|
||||
<Tooltip.Root>
|
||||
<Tooltip.Trigger>
|
||||
<!-- prevent another nested button element -->
|
||||
{#snippet child({ props })}
|
||||
<Button
|
||||
{...props}
|
||||
{variant}
|
||||
{size}
|
||||
{disabled}
|
||||
onclick={(e: MouseEvent) => {
|
||||
if (stopPropagationOnClick) e.stopPropagation();
|
||||
{#snippet button(props = {})}
|
||||
<Button
|
||||
{...props}
|
||||
{href}
|
||||
{variant}
|
||||
{size}
|
||||
{disabled}
|
||||
onclick={(e: MouseEvent) => {
|
||||
if (stopPropagationOnClick) e.stopPropagation();
|
||||
|
||||
onclick?.(e);
|
||||
}}
|
||||
class="h-6 w-6 p-0 {className} flex hover:bg-transparent data-[state=open]:bg-transparent!"
|
||||
aria-label={ariaLabel || tooltip}
|
||||
>
|
||||
{#if icon}
|
||||
{@const IconComponent = icon}
|
||||
<IconComponent class={iconSize} />
|
||||
{/if}
|
||||
</Button>
|
||||
{/snippet}
|
||||
</Tooltip.Trigger>
|
||||
onclick?.(e);
|
||||
}}
|
||||
class="h-6 w-6 p-0 {className} flex hover:bg-transparent data-[state=open]:bg-transparent!"
|
||||
aria-label={ariaLabel || tooltip}
|
||||
>
|
||||
{#if icon}
|
||||
{@const IconComponent = icon}
|
||||
|
||||
<Tooltip.Content side={tooltipSide}>
|
||||
<p>{tooltip}</p>
|
||||
</Tooltip.Content>
|
||||
</Tooltip.Root>
|
||||
<IconComponent class={iconSize} />
|
||||
{/if}
|
||||
</Button>
|
||||
{/snippet}
|
||||
|
||||
{#if showTooltip}
|
||||
<Tooltip.Root>
|
||||
<Tooltip.Trigger>
|
||||
<!-- prevent another nested button element -->
|
||||
{#snippet child({ props })}
|
||||
{@render button(props)}
|
||||
{/snippet}
|
||||
</Tooltip.Trigger>
|
||||
|
||||
<Tooltip.Content side={tooltipSide}>
|
||||
<p>{tooltip}</p>
|
||||
</Tooltip.Content>
|
||||
</Tooltip.Root>
|
||||
{:else}
|
||||
{@render button({ href })}
|
||||
{/if}
|
||||
|
||||
<svelte:window bind:innerWidth />
|
||||
|
||||
@@ -494,7 +494,7 @@
|
||||
/>
|
||||
|
||||
<div
|
||||
class="{INPUT_CLASSES} overflow-hidden rounded-3xl backdrop-blur-md {disabled
|
||||
class="{INPUT_CLASSES} overflow-hidden rounded-4xl md:rounded-3xl backdrop-blur-md {disabled
|
||||
? 'cursor-not-allowed opacity-60'
|
||||
: ''}"
|
||||
data-slot="input-area"
|
||||
@@ -510,7 +510,7 @@
|
||||
/>
|
||||
|
||||
<div
|
||||
class="flex-column relative min-h-[48px] items-center rounded-3xl py-2 pb-2.25 shadow-sm transition-all focus-within:shadow-md md:!py-3"
|
||||
class="flex-column relative min-h-12 items-center rounded-4xl md:rounded-3xl py-2 pb-2.25 shadow-sm transition-all focus-within:shadow-md md:py-3!"
|
||||
onpaste={handlePaste}
|
||||
>
|
||||
<ChatFormTextarea
|
||||
|
||||
+1
-1
@@ -15,7 +15,7 @@
|
||||
<Tooltip.Root>
|
||||
<Tooltip.Trigger class="w-full">
|
||||
<Button
|
||||
class="file-upload-button h-8 w-8 rounded-full p-0"
|
||||
class="file-upload-button md:h-8 md:w-8 h-9 w-9 rounded-full p-0"
|
||||
{disabled}
|
||||
{onclick}
|
||||
variant="secondary"
|
||||
|
||||
+16
-3
@@ -15,6 +15,7 @@
|
||||
import { McpLogo } from '$lib/components/app';
|
||||
import { PencilRuler, ChevronDown, ChevronRight } from '@lucide/svelte';
|
||||
import { HealthCheckStatus } from '$lib/enums';
|
||||
import { AttachmentAction } from '$lib/enums/attachment.enums';
|
||||
|
||||
interface Props {
|
||||
class?: string;
|
||||
@@ -270,14 +271,22 @@
|
||||
</Collapsible.Root>
|
||||
{/if}
|
||||
|
||||
<button type="button" class={sheetItemClass} onclick={onSystemPromptClick}>
|
||||
<button
|
||||
type="button"
|
||||
class={sheetItemClass}
|
||||
onclick={() => attachmentMenu.callbacks[AttachmentAction.SYSTEM_PROMPT_CLICK]()}
|
||||
>
|
||||
<MessageSquare class="h-4 w-4 shrink-0" />
|
||||
|
||||
<span>System Message</span>
|
||||
</button>
|
||||
|
||||
{#if hasMcpPromptsSupport}
|
||||
<button type="button" class={sheetItemClass} onclick={onMcpPromptClick}>
|
||||
<button
|
||||
type="button"
|
||||
class={sheetItemClass}
|
||||
onclick={() => attachmentMenu.callbacks[AttachmentAction.MCP_PROMPT_CLICK]()}
|
||||
>
|
||||
<Zap class="h-4 w-4 shrink-0" />
|
||||
|
||||
<span>MCP Prompt</span>
|
||||
@@ -285,7 +294,11 @@
|
||||
{/if}
|
||||
|
||||
{#if hasMcpResourcesSupport}
|
||||
<button type="button" class={sheetItemClass} onclick={onMcpResourcesClick}>
|
||||
<button
|
||||
type="button"
|
||||
class={sheetItemClass}
|
||||
onclick={() => attachmentMenu.callbacks[AttachmentAction.MCP_RESOURCES_CLICK]()}
|
||||
>
|
||||
<FolderOpen class="h-4 w-4 shrink-0" />
|
||||
|
||||
<span>MCP Resources</span>
|
||||
|
||||
+1
@@ -42,6 +42,7 @@
|
||||
{hasMcpPromptsSupport}
|
||||
{hasMcpResourcesSupport}
|
||||
{onFileUpload}
|
||||
{onSystemPromptClick}
|
||||
{onMcpPromptClick}
|
||||
{onMcpResourcesClick}
|
||||
>
|
||||
|
||||
+1
-1
@@ -20,7 +20,7 @@
|
||||
type="submit"
|
||||
disabled={isDisabled}
|
||||
class={[
|
||||
'h-8 w-8 rounded-full p-0',
|
||||
'md:h-8 md:w-8 h-9 w-9 rounded-full p-0',
|
||||
showErrorState &&
|
||||
'bg-red-400/10 text-red-400 hover:bg-red-400/20 hover:text-red-400 disabled:opacity-100'
|
||||
]}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
<script lang="ts">
|
||||
import { isMobile } from '$lib/stores/viewport.svelte';
|
||||
import { autoResizeTextarea } from '$lib/utils';
|
||||
import { onMount } from 'svelte';
|
||||
|
||||
@@ -37,7 +38,9 @@
|
||||
}
|
||||
|
||||
export function focus() {
|
||||
textareaElement?.focus();
|
||||
if (isMobile.current) return;
|
||||
|
||||
textareaElement?.focus({ preventScroll: true });
|
||||
}
|
||||
|
||||
export function resetHeight() {
|
||||
|
||||
@@ -231,7 +231,7 @@
|
||||
editedContent = message.content;
|
||||
}
|
||||
|
||||
textareaElement?.focus();
|
||||
textareaElement?.focus({ preventScroll: true });
|
||||
editedExtras = message.extra ? [...message.extra] : [];
|
||||
editedUploadedFiles = [];
|
||||
|
||||
@@ -324,7 +324,7 @@
|
||||
}
|
||||
</script>
|
||||
|
||||
<div use:fadeInView>
|
||||
<div use:fadeInView class="chat-message">
|
||||
{#if message.role === MessageRole.SYSTEM}
|
||||
<ChatMessageSystem
|
||||
bind:textareaElement
|
||||
|
||||
+72
-5
@@ -180,6 +180,9 @@
|
||||
|
||||
let displayedModel = $derived(message.model ?? null);
|
||||
|
||||
// model being switched to while it loads, so the selector bar tracks it
|
||||
let pendingModel = $state<string | null>(null);
|
||||
|
||||
let isCurrentlyLoading = $derived(isLoading());
|
||||
let isStreaming = $derived(isChatStreaming());
|
||||
let hasNoContent = $derived(!message?.content?.trim());
|
||||
@@ -207,6 +210,42 @@
|
||||
isLastAssistantMessage
|
||||
);
|
||||
|
||||
let assistantEl: HTMLDivElement | undefined = $state();
|
||||
let lastUserMessageHeight = $state(0);
|
||||
let assistantMarginTop = $state(0);
|
||||
|
||||
$effect(() => {
|
||||
if (!assistantEl) return;
|
||||
|
||||
assistantMarginTop = Math.round(parseFloat(getComputedStyle(assistantEl).marginTop));
|
||||
|
||||
const chatMessageEl = assistantEl.closest('.chat-message');
|
||||
const previousChatMessage = chatMessageEl?.previousElementSibling;
|
||||
const userMessageEl = previousChatMessage?.querySelector(
|
||||
'.chat-message-user'
|
||||
) as HTMLElement | null;
|
||||
|
||||
if (!userMessageEl) {
|
||||
lastUserMessageHeight = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
const updateHeight = () => {
|
||||
const rect = userMessageEl.getBoundingClientRect();
|
||||
const marginTop = Math.round(parseFloat(getComputedStyle(userMessageEl).marginTop));
|
||||
lastUserMessageHeight = Math.round(rect.height + marginTop);
|
||||
};
|
||||
|
||||
updateHeight();
|
||||
|
||||
const resizeObserver = new ResizeObserver(updateHeight);
|
||||
resizeObserver.observe(userMessageEl);
|
||||
|
||||
return () => {
|
||||
resizeObserver.disconnect();
|
||||
};
|
||||
});
|
||||
|
||||
function handleCopyModel() {
|
||||
void copyToClipboard(displayedModel ?? '');
|
||||
}
|
||||
@@ -219,12 +258,17 @@
|
||||
</script>
|
||||
|
||||
<div
|
||||
class="text-md group w-full leading-7.5 {className}"
|
||||
bind:this={assistantEl}
|
||||
class="chat-message-assistant text-md group w-full leading-7.5 {className}"
|
||||
style:--last-user-message-height={lastUserMessageHeight > 0
|
||||
? `${lastUserMessageHeight}px`
|
||||
: undefined}
|
||||
style:--assistant-margin-top={assistantMarginTop > 0 ? `${assistantMarginTop}px` : undefined}
|
||||
role="group"
|
||||
aria-label="Assistant message with actions"
|
||||
>
|
||||
{#if showProcessingInfoTop}
|
||||
<div class="mt-6 w-full max-w-[48rem]" in:fade>
|
||||
<div class="mt-6 w-full max-w-3xl" in:fade>
|
||||
<div class="processing-container">
|
||||
<span class="processing-text">
|
||||
{modelLoadingText ??
|
||||
@@ -257,7 +301,7 @@
|
||||
{/if}
|
||||
|
||||
{#if showProcessingInfoBottom}
|
||||
<div class="mt-4 w-full max-w-[48rem]" in:fade>
|
||||
<div class="mt-4 w-full max-w-3xl" in:fade>
|
||||
<div class="processing-container">
|
||||
<span class="processing-text">
|
||||
{modelLoadingText ??
|
||||
@@ -277,13 +321,19 @@
|
||||
>
|
||||
{#if isRouter}
|
||||
<ModelsSelectorDropdown
|
||||
currentModel={displayedModel}
|
||||
currentModel={pendingModel ?? displayedModel}
|
||||
disabled={isLoading()}
|
||||
onModelChange={async (modelId: string, modelName: string) => {
|
||||
const status = modelsStore.getModelStatus(modelId);
|
||||
|
||||
if (status !== ServerModelStatus.LOADED) {
|
||||
await modelsStore.loadModel(modelId);
|
||||
pendingModel = modelId;
|
||||
|
||||
try {
|
||||
await modelsStore.loadModel(modelId);
|
||||
} finally {
|
||||
pendingModel = null;
|
||||
}
|
||||
}
|
||||
|
||||
onRegenerate(modelName);
|
||||
@@ -351,6 +401,23 @@
|
||||
</div>
|
||||
|
||||
<style>
|
||||
:global(.chat-message):last-child .chat-message-assistant {
|
||||
--assistant-min-height-offset: calc(
|
||||
var(--last-user-message-height, 19rem) + var(--chat-form-height, 6rem) +
|
||||
var(--chat-form-bottom-position, 0.5rem) + var(--chat-form-padding-top, 6rem) +
|
||||
var(--assistant-margin-top, 3rem)
|
||||
);
|
||||
min-height: calc(100dvh - var(--assistant-min-height-offset));
|
||||
|
||||
@media (width > 768px) {
|
||||
--assistant-min-height-offset: calc(
|
||||
var(--last-user-message-height, 18rem) + var(--chat-form-height, 6rem) +
|
||||
var(--chat-form-bottom-position, 1rem) + var(--chat-form-padding-top, 6rem) +
|
||||
var(--assistant-margin-top, 3rem)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
.processing-container {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
|
||||
+1
-1
@@ -48,7 +48,7 @@
|
||||
|
||||
<div
|
||||
aria-label="User message with actions"
|
||||
class="group flex flex-col items-end gap-3 md:gap-2 {className}"
|
||||
class="chat-message-user group flex flex-col items-end gap-3 md:gap-2 {className}"
|
||||
role="group"
|
||||
>
|
||||
{#if editCtx.isEditing}
|
||||
|
||||
+2
-2
@@ -19,7 +19,7 @@
|
||||
renderMarkdown = false,
|
||||
textColorClass = 'text-foreground',
|
||||
cardBgClass = 'dark:bg-primary/15',
|
||||
maxHeightStyle = 'max-height: var(--max-message-height);'
|
||||
maxHeightStyle = ''
|
||||
}: Props = $props();
|
||||
|
||||
let isMultiline = $state(false);
|
||||
@@ -59,7 +59,7 @@
|
||||
|
||||
{#if content.trim()}
|
||||
<Card
|
||||
class="max-w-[80%] overflow-y-auto rounded-[1.125rem] border-none bg-primary/5 px-3.75 py-1.5 {textColorClass} backdrop-blur-md data-[multiline]:py-2.5 {cardBgClass}"
|
||||
class="chat-message-user-bubble max-w-[80%] overflow-y-auto rounded-[1.125rem] border-none bg-primary/5 px-3.75 py-1.5 {textColorClass} backdrop-blur-md data-multiline:py-2.5 {cardBgClass}"
|
||||
data-multiline={isMultiline ? '' : undefined}
|
||||
style="{maxHeightStyle} overflow-wrap: anywhere; word-break: break-word;"
|
||||
>
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
let allConversationMessages = $state<DatabaseMessage[]>([]);
|
||||
let isVisible = $state(false);
|
||||
let previousConversationId = $state<string | null>(null);
|
||||
let previousRouteId = $state<string | null>(null);
|
||||
|
||||
const currentConfig = config();
|
||||
|
||||
@@ -157,8 +158,9 @@
|
||||
});
|
||||
});
|
||||
|
||||
beforeNavigate(() => {
|
||||
beforeNavigate((navigation) => {
|
||||
isVisible = false;
|
||||
previousRouteId = navigation.from?.route.id ?? null;
|
||||
});
|
||||
|
||||
afterNavigate(() => {
|
||||
@@ -249,12 +251,13 @@
|
||||
</script>
|
||||
|
||||
<div
|
||||
class="transition-opacity delay-300 duration-500 ease-out
|
||||
{isVisible ? 'opacity-100' : 'opacity-0'}"
|
||||
class="transition-opacity duration-500 ease-out
|
||||
{isVisible ? 'opacity-100' : 'opacity-0'}
|
||||
{previousRouteId === '/(chat)/chat/[id]' ? '' : 'delay-300'}"
|
||||
>
|
||||
{#each displayMessages as { message, toolMessages, isLastAssistantMessage, siblingInfo } (message.id)}
|
||||
<ChatMessage
|
||||
class="mx-auto mt-12 w-full max-w-[48rem]"
|
||||
class="mx-auto mt-12 w-full max-w-3xl"
|
||||
{message}
|
||||
{toolMessages}
|
||||
{isLastAssistantMessage}
|
||||
|
||||
@@ -1,31 +1,28 @@
|
||||
<script lang="ts">
|
||||
import { Trash2 } from '@lucide/svelte';
|
||||
import { afterNavigate } from '$app/navigation';
|
||||
import { page } from '$app/state';
|
||||
import {
|
||||
ChatScreenForm,
|
||||
ChatMessages,
|
||||
ChatScreenDragOverlay,
|
||||
ChatScreenProcessingInfo,
|
||||
ChatScreenActionScrollDown,
|
||||
DialogEmptyFileAlert,
|
||||
DialogFileUploadError,
|
||||
DialogChatError,
|
||||
ServerLoadingSplash,
|
||||
DialogConfirmation,
|
||||
ChatScreenServerError
|
||||
} from '$lib/components/app';
|
||||
import { setProcessingInfoContext } from '$lib/contexts';
|
||||
import { ErrorDialogType } from '$lib/enums';
|
||||
import { createAutoScrollController } from '$lib/hooks/use-auto-scroll.svelte';
|
||||
import { useChatScreenActiveModel } from '$lib/hooks/use-chat-screen-active-model.svelte';
|
||||
import { useChatScreenDragAndDrop } from '$lib/hooks/use-chat-screen-drag-and-drop.svelte';
|
||||
import { useChatScreenFileUpload } from '$lib/hooks/use-chat-screen-file-upload.svelte';
|
||||
import { useChatScreenScroll } from '$lib/hooks/use-chat-screen-scroll.svelte';
|
||||
import { useKeyboardShortcuts } from '$lib/hooks/use-keyboard-shortcuts.svelte';
|
||||
import { device } from '$lib/stores/device.svelte';
|
||||
import { isMobile } from '$lib/stores/viewport.svelte';
|
||||
import {
|
||||
chatStore,
|
||||
errorDialog,
|
||||
isLoading,
|
||||
isChatStreaming,
|
||||
isEditing,
|
||||
getAddFilesHandler,
|
||||
activeProcessingState
|
||||
} from '$lib/stores/chat.svelte';
|
||||
import {
|
||||
@@ -34,138 +31,81 @@
|
||||
activeConversation
|
||||
} from '$lib/stores/conversations.svelte';
|
||||
import { config } from '$lib/stores/settings.svelte';
|
||||
import { serverLoading, serverError, isRouterMode } from '$lib/stores/server.svelte';
|
||||
import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte';
|
||||
import { isFileTypeSupported, filterFilesByModalities } from '$lib/utils';
|
||||
import { parseFilesToMessageExtras, processFilesToChatUploaded } from '$lib/utils/browser-only';
|
||||
import { onMount } from 'svelte';
|
||||
import { serverLoading, serverError } from '$lib/stores/server.svelte';
|
||||
import { parseFilesToMessageExtras } from '$lib/utils/browser-only';
|
||||
import { onDestroy, onMount } from 'svelte';
|
||||
import ChatScreenGreeting from './ChatScreenGreeting.svelte';
|
||||
import ChatScreenActionScrollDown from './ChatScreenActionScrollDown.svelte';
|
||||
import ChatScreenDialogsAndAlerts from './ChatScreenDialogsAndAlerts.svelte';
|
||||
import { ROUTES } from '$lib/constants';
|
||||
|
||||
let { showCenteredEmpty = false } = $props();
|
||||
|
||||
const autoScroll = createAutoScrollController();
|
||||
|
||||
let disableAutoScroll = $derived(Boolean(config().disableAutoScroll));
|
||||
let chatScrollContainer: HTMLDivElement | undefined = $state();
|
||||
let dragCounter = $state(0);
|
||||
let isDragOver = $state(false);
|
||||
let showFileErrorDialog = $state(false);
|
||||
let uploadedFiles = $state<ChatUploadedFile[]>([]);
|
||||
|
||||
let fileErrorData = $state<{
|
||||
generallyUnsupported: File[];
|
||||
modalityUnsupported: File[];
|
||||
modalityReasons: Record<string, string>;
|
||||
supportedTypes: string[];
|
||||
}>({
|
||||
generallyUnsupported: [],
|
||||
modalityUnsupported: [],
|
||||
modalityReasons: {},
|
||||
supportedTypes: []
|
||||
});
|
||||
|
||||
let showDeleteDialog = $state(false);
|
||||
|
||||
let showEmptyFileDialog = $state(false);
|
||||
|
||||
let processingInfoVisible = $state(false);
|
||||
|
||||
let emptyFileNames = $state<string[]>([]);
|
||||
|
||||
let initialMessage = $state('');
|
||||
|
||||
let isEmpty = $derived(
|
||||
showCenteredEmpty && !activeConversation() && activeMessages().length === 0 && !isLoading()
|
||||
);
|
||||
|
||||
let activeErrorDialog = $derived(errorDialog());
|
||||
let isServerLoading = $derived(serverLoading());
|
||||
let hasPropsError = $derived(!!serverError());
|
||||
|
||||
let isCurrentConversationLoading = $derived(isLoading() || isChatStreaming());
|
||||
|
||||
let showProcessingInfo = $derived(
|
||||
isCurrentConversationLoading ||
|
||||
(config().keepStatsVisible && !!page.params.id) ||
|
||||
activeProcessingState() !== null
|
||||
);
|
||||
|
||||
let isRouter = $derived(isRouterMode());
|
||||
|
||||
let conversationModel = $derived(
|
||||
chatStore.getConversationModel(activeMessages() as DatabaseMessage[])
|
||||
);
|
||||
|
||||
let activeModelId = $derived.by(() => {
|
||||
const options = modelOptions();
|
||||
|
||||
if (!isRouter) {
|
||||
return options.length > 0 ? options[0].model : null;
|
||||
}
|
||||
|
||||
const selectedId = selectedModelId();
|
||||
if (selectedId) {
|
||||
const model = options.find((m) => m.id === selectedId);
|
||||
if (model) return model.model;
|
||||
}
|
||||
|
||||
if (conversationModel) {
|
||||
const model = options.find((m) => m.model === conversationModel);
|
||||
if (model) return model.model;
|
||||
}
|
||||
|
||||
return null;
|
||||
});
|
||||
|
||||
let modelPropsVersion = $state(0);
|
||||
|
||||
setProcessingInfoContext({
|
||||
get showProcessingInfo() {
|
||||
return showProcessingInfo;
|
||||
}
|
||||
});
|
||||
|
||||
$effect(() => {
|
||||
if (activeModelId) {
|
||||
const cached = modelsStore.getModelProps(activeModelId);
|
||||
let disableAutoScroll = $derived(Boolean(config().disableAutoScroll) || isMobile.current);
|
||||
let isMobileUserScrolledUp = $state(false);
|
||||
let mobileScrollDownHint = $state(false);
|
||||
let mobileScrollDownHintLockedUntil = $state(0);
|
||||
let emptyFileNames = $state<string[]>([]);
|
||||
let initialMessage = $state('');
|
||||
let showDeleteDialog = $state(false);
|
||||
let showEmptyFileDialog = $state(false);
|
||||
let isEmpty = $derived(
|
||||
showCenteredEmpty && !activeConversation() && activeMessages().length === 0 && !isLoading()
|
||||
);
|
||||
let activeErrorDialog = $derived(errorDialog());
|
||||
let isServerLoading = $derived(serverLoading());
|
||||
let hasPropsError = $derived(!!serverError());
|
||||
let isCurrentConversationLoading = $derived(isLoading() || isChatStreaming());
|
||||
let showProcessingInfo = $derived(
|
||||
isCurrentConversationLoading ||
|
||||
(config().keepStatsVisible && !!page.params.id) ||
|
||||
activeProcessingState() !== null
|
||||
);
|
||||
let chatFormBottomPosition = $derived.by(() => {
|
||||
if (!isMobile.current) return '1rem';
|
||||
if (device.isStandalone) return '1.5rem';
|
||||
if (device.isIOSSafari) return '0.25rem';
|
||||
return '0.5rem';
|
||||
});
|
||||
|
||||
if (!cached) {
|
||||
modelsStore.fetchModelProps(activeModelId).then(() => {
|
||||
modelPropsVersion++;
|
||||
});
|
||||
const autoScroll = createAutoScrollController();
|
||||
const scroll = useChatScreenScroll(autoScroll);
|
||||
const activeModel = useChatScreenActiveModel();
|
||||
const fileUpload = useChatScreenFileUpload({
|
||||
capabilities: () => ({
|
||||
hasVision: activeModel.hasVisionModality,
|
||||
hasAudio: activeModel.hasAudioModality,
|
||||
hasVideo: activeModel.hasVideoModality
|
||||
}),
|
||||
activeModelId: () => activeModel.activeModelId
|
||||
});
|
||||
const dragAndDrop = useChatScreenDragAndDrop({
|
||||
onDrop: fileUpload.handleFileUpload
|
||||
});
|
||||
const { handleKeydown } = useKeyboardShortcuts({
|
||||
deleteActiveConversation: () => {
|
||||
if (activeConversation()) {
|
||||
showDeleteDialog = true;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let hasAudioModality = $derived.by(() => {
|
||||
if (activeModelId) {
|
||||
void modelPropsVersion;
|
||||
function handleMobileScroll() {
|
||||
if (!isMobile.current) return;
|
||||
|
||||
return modelsStore.modelSupportsAudio(activeModelId);
|
||||
}
|
||||
const container = scroll.chatScrollContainer;
|
||||
if (!container) return;
|
||||
|
||||
return false;
|
||||
});
|
||||
|
||||
let hasVideoModality = $derived.by(() => {
|
||||
if (activeModelId) {
|
||||
void modelPropsVersion;
|
||||
|
||||
return modelsStore.modelSupportsVideo(activeModelId);
|
||||
}
|
||||
|
||||
return false;
|
||||
});
|
||||
|
||||
let hasVisionModality = $derived.by(() => {
|
||||
if (activeModelId) {
|
||||
void modelPropsVersion;
|
||||
|
||||
return modelsStore.modelSupportsVision(activeModelId);
|
||||
}
|
||||
|
||||
return false;
|
||||
});
|
||||
const distanceFromBottom =
|
||||
container.scrollHeight - container.clientHeight - container.scrollTop;
|
||||
isMobileUserScrolledUp = distanceFromBottom > 300;
|
||||
}
|
||||
|
||||
async function handleDeleteConfirm() {
|
||||
const conversation = activeConversation();
|
||||
@@ -177,27 +117,69 @@
|
||||
showDeleteDialog = false;
|
||||
}
|
||||
|
||||
function handleProcessingInfoVisibility(visible: boolean) {
|
||||
processingInfoVisible = visible;
|
||||
}
|
||||
async function handleSendMessage(message: string, files?: ChatUploadedFile[]): Promise<boolean> {
|
||||
const plainFiles = files ? $state.snapshot(files) : undefined;
|
||||
const result = plainFiles
|
||||
? await parseFilesToMessageExtras(plainFiles, activeModel.activeModelId ?? undefined)
|
||||
: undefined;
|
||||
|
||||
function handleDragEnter(event: DragEvent) {
|
||||
event.preventDefault();
|
||||
|
||||
dragCounter++;
|
||||
|
||||
if (event.dataTransfer?.types.includes('Files')) {
|
||||
isDragOver = true;
|
||||
if (result?.emptyFiles && result.emptyFiles.length > 0) {
|
||||
emptyFileNames = result.emptyFiles;
|
||||
showEmptyFileDialog = true;
|
||||
if (files) {
|
||||
const emptyFileNamesSet = new Set(result.emptyFiles);
|
||||
fileUpload.uploadedFiles = fileUpload.uploadedFiles.filter(
|
||||
(file) => !emptyFileNamesSet.has(file.name)
|
||||
);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
handleSendLikeScroll();
|
||||
|
||||
await chatStore.sendMessage(message, result?.extras);
|
||||
return true;
|
||||
}
|
||||
|
||||
function handleDragLeave(event: DragEvent) {
|
||||
event.preventDefault();
|
||||
function handleSendLikeScroll() {
|
||||
if (!isMobile.current) {
|
||||
autoScroll.enable();
|
||||
}
|
||||
|
||||
dragCounter--;
|
||||
setTimeout(() => {
|
||||
const container = scroll.chatScrollContainer;
|
||||
if (!container) return;
|
||||
|
||||
if (dragCounter === 0) {
|
||||
isDragOver = false;
|
||||
const lastUserBubble = container.querySelector(
|
||||
'.chat-message:nth-last-child(2) .chat-message-user .chat-message-user-bubble'
|
||||
) as HTMLElement | null;
|
||||
|
||||
if (isMobile.current) {
|
||||
// Keep the last user message bubble just above the input on mobile
|
||||
const bubbleHeight = lastUserBubble?.scrollHeight ?? 0;
|
||||
const baseHeight = container.scrollHeight - innerHeight;
|
||||
|
||||
container.scrollTo({
|
||||
top: bubbleHeight > 0 ? baseHeight - bubbleHeight : baseHeight,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
} else if (lastUserBubble) {
|
||||
// On desktop, place the last user message near the top of the viewport
|
||||
const topPadding = 24;
|
||||
const bubbleRect = lastUserBubble.getBoundingClientRect();
|
||||
container.scrollTo({
|
||||
top: Math.max(0, container.scrollTop + bubbleRect.top - topPadding),
|
||||
behavior: 'smooth'
|
||||
});
|
||||
} else {
|
||||
autoScroll.scrollToBottom();
|
||||
}
|
||||
}, 100);
|
||||
|
||||
if (isMobile.current) {
|
||||
autoScroll.setDisabled(disableAutoScroll);
|
||||
mobileScrollDownHint = true;
|
||||
mobileScrollDownHintLockedUntil = Date.now() + 500;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -207,273 +189,138 @@
|
||||
}
|
||||
}
|
||||
|
||||
function handleDragOver(event: DragEvent) {
|
||||
event.preventDefault();
|
||||
}
|
||||
|
||||
function handleDrop(event: DragEvent) {
|
||||
event.preventDefault();
|
||||
|
||||
isDragOver = false;
|
||||
dragCounter = 0;
|
||||
|
||||
if (event.dataTransfer?.files) {
|
||||
const files = Array.from(event.dataTransfer.files);
|
||||
|
||||
if (isEditing()) {
|
||||
const handler = getAddFilesHandler();
|
||||
|
||||
if (handler) {
|
||||
handler(files);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
processFiles(files);
|
||||
}
|
||||
}
|
||||
|
||||
function handleFileRemove(fileId: string) {
|
||||
uploadedFiles = uploadedFiles.filter((f) => f.id !== fileId);
|
||||
}
|
||||
|
||||
function handleFileUpload(files: File[]) {
|
||||
processFiles(files);
|
||||
}
|
||||
|
||||
const { handleKeydown } = useKeyboardShortcuts({
|
||||
deleteActiveConversation: () => {
|
||||
if (activeConversation()) {
|
||||
showDeleteDialog = true;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
async function handleSystemPromptAdd(draft: { message: string; files: ChatUploadedFile[] }) {
|
||||
if (draft.message || draft.files.length > 0) {
|
||||
chatStore.savePendingDraft(draft.message, draft.files);
|
||||
}
|
||||
|
||||
await chatStore.addSystemPrompt();
|
||||
}
|
||||
|
||||
function handleScroll() {
|
||||
autoScroll.handleScroll();
|
||||
}
|
||||
|
||||
async function handleSendMessage(message: string, files?: ChatUploadedFile[]): Promise<boolean> {
|
||||
const plainFiles = files ? $state.snapshot(files) : undefined;
|
||||
const result = plainFiles
|
||||
? await parseFilesToMessageExtras(plainFiles, activeModelId ?? undefined)
|
||||
: undefined;
|
||||
|
||||
if (result?.emptyFiles && result.emptyFiles.length > 0) {
|
||||
emptyFileNames = result.emptyFiles;
|
||||
showEmptyFileDialog = true;
|
||||
|
||||
if (files) {
|
||||
const emptyFileNamesSet = new Set(result.emptyFiles);
|
||||
uploadedFiles = uploadedFiles.filter((file) => !emptyFileNamesSet.has(file.name));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
const extras = result?.extras;
|
||||
|
||||
// Enable autoscroll for user-initiated message sending
|
||||
autoScroll.enable();
|
||||
await chatStore.sendMessage(message, extras);
|
||||
autoScroll.scrollToBottom();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async function processFiles(files: File[]) {
|
||||
const generallySupported: File[] = [];
|
||||
const generallyUnsupported: File[] = [];
|
||||
|
||||
for (const file of files) {
|
||||
if (isFileTypeSupported(file.name, file.type)) {
|
||||
generallySupported.push(file);
|
||||
} else {
|
||||
generallyUnsupported.push(file);
|
||||
}
|
||||
}
|
||||
|
||||
// Use model-specific capabilities for file validation
|
||||
const capabilities = {
|
||||
hasVision: hasVisionModality,
|
||||
hasAudio: hasAudioModality,
|
||||
hasVideo: hasVideoModality
|
||||
};
|
||||
const { supportedFiles, unsupportedFiles, modalityReasons } = filterFilesByModalities(
|
||||
generallySupported,
|
||||
capabilities
|
||||
);
|
||||
|
||||
const allUnsupportedFiles = [...generallyUnsupported, ...unsupportedFiles];
|
||||
|
||||
if (allUnsupportedFiles.length > 0) {
|
||||
const supportedTypes: string[] = ['text files', 'PDFs'];
|
||||
|
||||
if (hasVisionModality) supportedTypes.push('images');
|
||||
if (hasAudioModality) supportedTypes.push('audio files');
|
||||
if (hasVideoModality) supportedTypes.push('video files');
|
||||
|
||||
fileErrorData = {
|
||||
generallyUnsupported,
|
||||
modalityUnsupported: unsupportedFiles,
|
||||
modalityReasons,
|
||||
supportedTypes
|
||||
};
|
||||
showFileErrorDialog = true;
|
||||
}
|
||||
|
||||
if (supportedFiles.length > 0) {
|
||||
const processed = await processFilesToChatUploaded(
|
||||
supportedFiles,
|
||||
activeModelId ?? undefined
|
||||
);
|
||||
uploadedFiles = [...uploadedFiles, ...processed];
|
||||
}
|
||||
}
|
||||
|
||||
afterNavigate(() => {
|
||||
if (!disableAutoScroll) {
|
||||
$effect(() => {
|
||||
const shouldDisableAutoScroll =
|
||||
config().disableAutoScroll || (isMobile.current && isCurrentConversationLoading);
|
||||
autoScroll.setDisabled(shouldDisableAutoScroll);
|
||||
if (!shouldDisableAutoScroll) {
|
||||
autoScroll.enable();
|
||||
}
|
||||
});
|
||||
|
||||
function handleMessagesReady() {
|
||||
if (disableAutoScroll) return;
|
||||
|
||||
if (!autoScroll.userScrolledUp) {
|
||||
requestAnimationFrame(() => {
|
||||
autoScroll.scrollToBottom('instant');
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
onMount(() => {
|
||||
const pendingDraft = chatStore.consumePendingDraft();
|
||||
if (pendingDraft) {
|
||||
initialMessage = pendingDraft.message;
|
||||
fileUpload.uploadedFiles = pendingDraft.files;
|
||||
}
|
||||
|
||||
autoScroll.startObserving();
|
||||
|
||||
if (!disableAutoScroll) {
|
||||
autoScroll.enable();
|
||||
}
|
||||
|
||||
const pendingDraft = chatStore.consumePendingDraft();
|
||||
if (pendingDraft) {
|
||||
initialMessage = pendingDraft.message;
|
||||
uploadedFiles = pendingDraft.files;
|
||||
if (isMobile.current && isCurrentConversationLoading) {
|
||||
mobileScrollDownHint = true;
|
||||
mobileScrollDownHintLockedUntil = Date.now() + 500;
|
||||
}
|
||||
|
||||
handleMobileScroll();
|
||||
});
|
||||
|
||||
$effect(() => {
|
||||
autoScroll.setContainer(chatScrollContainer);
|
||||
});
|
||||
|
||||
$effect(() => {
|
||||
autoScroll.setDisabled(disableAutoScroll);
|
||||
});
|
||||
onDestroy(() => autoScroll.destroy());
|
||||
</script>
|
||||
|
||||
{#if isDragOver}
|
||||
{#if dragAndDrop.isDragOver}
|
||||
<ChatScreenDragOverlay />
|
||||
{/if}
|
||||
|
||||
<svelte:window onkeydown={handleKeydown} />
|
||||
<svelte:window
|
||||
onkeydown={handleKeydown}
|
||||
onscroll={(e) => {
|
||||
scroll.handleScroll(e);
|
||||
handleMobileScroll();
|
||||
if (e.isTrusted && Date.now() > mobileScrollDownHintLockedUntil) {
|
||||
mobileScrollDownHint = false;
|
||||
}
|
||||
}}
|
||||
/>
|
||||
|
||||
{#if isServerLoading}
|
||||
<ServerLoadingSplash />
|
||||
{:else}
|
||||
<div
|
||||
bind:this={chatScrollContainer}
|
||||
aria-label="Chat interface with file drop zone"
|
||||
class="flex h-full flex-col overflow-y-auto px-4 md:px-6"
|
||||
ondragenter={handleDragEnter}
|
||||
ondragleave={handleDragLeave}
|
||||
ondragover={handleDragOver}
|
||||
ondrop={handleDrop}
|
||||
onscroll={handleScroll}
|
||||
class="chat-screen flex grow flex-col min-h-[calc(100dvh-1rem)] md:min-h-full px-4 md:py-0 pt-12 pb-48 md:pb-4"
|
||||
style:--chat-form-bottom-position={chatFormBottomPosition}
|
||||
ondragenter={dragAndDrop.dragHandlers.dragenter}
|
||||
ondragleave={dragAndDrop.dragHandlers.dragleave}
|
||||
ondragover={dragAndDrop.dragHandlers.dragover}
|
||||
ondrop={dragAndDrop.dragHandlers.drop}
|
||||
role="main"
|
||||
>
|
||||
<div class="flex grow flex-col pt-14">
|
||||
{#if !isEmpty}
|
||||
<ChatMessages
|
||||
messages={activeMessages()}
|
||||
onMessagesReady={handleMessagesReady}
|
||||
onUserAction={() => {
|
||||
autoScroll.enable();
|
||||
if (!autoScroll.userScrolledUp) {
|
||||
autoScroll.scrollToBottom();
|
||||
}
|
||||
}}
|
||||
/>
|
||||
{/if}
|
||||
{#if !isEmpty}
|
||||
<ChatMessages
|
||||
messages={activeMessages()}
|
||||
onUserAction={() => {
|
||||
handleSendLikeScroll();
|
||||
}}
|
||||
/>
|
||||
{/if}
|
||||
|
||||
<div
|
||||
class={[
|
||||
'pointer-events-none sticky right-4 left-4 mt-auto transition-all duration-200',
|
||||
isEmpty ? 'bottom-[calc(50dvh-7rem)]' : 'bottom-4 pt-24 md:pt-32'
|
||||
]}
|
||||
>
|
||||
<ChatScreenGreeting {isEmpty} />
|
||||
<div
|
||||
class={[
|
||||
'pointer-events-none md:sticky fixed mt-auto transition-all duration-200',
|
||||
device.isStandalone
|
||||
? 'bottom-6 right-4 left-4'
|
||||
: device.isIOSSafari
|
||||
? 'bottom-1 left-2 right-2'
|
||||
: 'bottom-2 right-2 left-2',
|
||||
isEmpty ? 'md:bottom-[calc(50dvh-7rem)] 2xl:bottom-[calc(50dvh-4rem)]' : 'md:bottom-4'
|
||||
]}
|
||||
style:padding-top={!isEmpty ? 'var(--chat-form-padding-top)' : undefined}
|
||||
>
|
||||
<ChatScreenGreeting {isEmpty} />
|
||||
|
||||
<ChatScreenActionScrollDown
|
||||
container={chatScrollContainer}
|
||||
hasProcessingInfoVisible={processingInfoVisible}
|
||||
/>
|
||||
<ChatScreenServerError />
|
||||
|
||||
<ChatScreenProcessingInfo onVisibilityChange={handleProcessingInfoVisibility} />
|
||||
|
||||
<ChatScreenServerError />
|
||||
|
||||
<div class="conversation-chat-form pointer-events-auto rounded-t-3xl">
|
||||
<ChatScreenForm
|
||||
disabled={hasPropsError || isEditing()}
|
||||
{initialMessage}
|
||||
isLoading={isCurrentConversationLoading}
|
||||
onFileRemove={handleFileRemove}
|
||||
onFileUpload={handleFileUpload}
|
||||
onSend={handleSendMessage}
|
||||
onStop={() => chatStore.stopGeneration()}
|
||||
onSystemPromptAdd={handleSystemPromptAdd}
|
||||
bind:uploadedFiles
|
||||
<div class="pointer-events-none flex flex-col gap-6 items-center w-full">
|
||||
{#if (isMobile.current ? mobileScrollDownHint || isMobileUserScrolledUp : autoScroll.userScrolledUp) && page.url.hash.includes(ROUTES.CHAT) && page.params.id}
|
||||
<ChatScreenActionScrollDown
|
||||
onclick={() => {
|
||||
mobileScrollDownHint = false;
|
||||
scroll.chatScrollContainer?.scrollTo({
|
||||
top: scroll.chatScrollContainer.scrollHeight,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
{#if showProcessingInfo}
|
||||
<ChatScreenProcessingInfo />
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
<ChatScreenForm
|
||||
class="pointer-events-auto conversation-chat-form"
|
||||
disabled={hasPropsError || isEditing()}
|
||||
{initialMessage}
|
||||
isLoading={isCurrentConversationLoading}
|
||||
onFileRemove={fileUpload.handleFileRemove}
|
||||
onFileUpload={fileUpload.handleFileUpload}
|
||||
onSend={handleSendMessage}
|
||||
onStop={() => chatStore.stopGeneration()}
|
||||
onSystemPromptAdd={handleSystemPromptAdd}
|
||||
bind:uploadedFiles={fileUpload.uploadedFiles}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
<DialogFileUploadError bind:open={showFileErrorDialog} {fileErrorData} />
|
||||
|
||||
<DialogConfirmation
|
||||
bind:open={showDeleteDialog}
|
||||
title="Delete Conversation"
|
||||
description="Are you sure you want to delete this conversation? This action cannot be undone and will permanently remove all messages in this conversation."
|
||||
confirmText="Delete"
|
||||
cancelText="Cancel"
|
||||
variant="destructive"
|
||||
icon={Trash2}
|
||||
onConfirm={handleDeleteConfirm}
|
||||
onCancel={() => (showDeleteDialog = false)}
|
||||
/>
|
||||
|
||||
<DialogEmptyFileAlert
|
||||
bind:open={showEmptyFileDialog}
|
||||
emptyFiles={emptyFileNames}
|
||||
onOpenChange={(open) => {
|
||||
if (!open) {
|
||||
emptyFileNames = [];
|
||||
}
|
||||
}}
|
||||
/>
|
||||
|
||||
<DialogChatError
|
||||
message={activeErrorDialog?.message ?? ''}
|
||||
contextInfo={activeErrorDialog?.contextInfo}
|
||||
onOpenChange={handleErrorDialogOpenChange}
|
||||
open={Boolean(activeErrorDialog)}
|
||||
type={activeErrorDialog?.type ?? ErrorDialogType.SERVER}
|
||||
<ChatScreenDialogsAndAlerts
|
||||
{showDeleteDialog}
|
||||
{handleDeleteConfirm}
|
||||
{showEmptyFileDialog}
|
||||
{emptyFileNames}
|
||||
{activeErrorDialog}
|
||||
{handleErrorDialogOpenChange}
|
||||
{fileUpload}
|
||||
/>
|
||||
|
||||
@@ -1,58 +1,18 @@
|
||||
<script lang="ts">
|
||||
import { ArrowDown } from '@lucide/svelte';
|
||||
import { Button } from '$lib/components/ui/button';
|
||||
import ActionIcon from '$lib/components/app/actions/ActionIcon.svelte';
|
||||
|
||||
interface Props {
|
||||
container: HTMLDivElement | undefined;
|
||||
hasProcessingInfoVisible: boolean;
|
||||
}
|
||||
|
||||
let { container, hasProcessingInfoVisible }: Props = $props();
|
||||
|
||||
let show = $state(false);
|
||||
|
||||
let buttonBottom = $derived(hasProcessingInfoVisible ? '2rem' : '0');
|
||||
|
||||
function checkVisibility() {
|
||||
if (!container) return;
|
||||
const { scrollTop, scrollHeight, clientHeight } = container;
|
||||
const distanceFromBottom = scrollHeight - clientHeight - scrollTop;
|
||||
show = distanceFromBottom > clientHeight * 0.5;
|
||||
}
|
||||
|
||||
function scrollToBottom() {
|
||||
if (container) {
|
||||
container.scrollTo({
|
||||
top: container.scrollHeight,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
$effect(() => {
|
||||
const c = container;
|
||||
if (c) {
|
||||
c.addEventListener('scroll', checkVisibility);
|
||||
checkVisibility();
|
||||
return () => {
|
||||
c.removeEventListener('scroll', checkVisibility);
|
||||
};
|
||||
}
|
||||
});
|
||||
let { onclick }: { onclick: (e?: MouseEvent) => void } = $props();
|
||||
</script>
|
||||
|
||||
<div class="relative z-50 mx-auto mb-4 flex max-w-[48rem] justify-center">
|
||||
<Button
|
||||
onclick={scrollToBottom}
|
||||
variant="secondary"
|
||||
size="icon"
|
||||
disabled={!show}
|
||||
class="pointer-events-auto absolute h-10 w-10 rounded-full bg-background/80 shadow-lg backdrop-blur-sm transition-all duration-200 hover:bg-muted/80"
|
||||
style="bottom: {buttonBottom}; transform: translateY({show ? '0' : '2rem'}); opacity: {show
|
||||
? 1
|
||||
: 0};"
|
||||
aria-label="Scroll to bottom"
|
||||
>
|
||||
<ArrowDown class="h-4 w-4" />
|
||||
</Button>
|
||||
<div class="pointer-events-auto flex justify-center relative h-0">
|
||||
<ActionIcon
|
||||
icon={ArrowDown}
|
||||
{onclick}
|
||||
ariaLabel="Scroll to bottom"
|
||||
tooltip="Scroll to bottom"
|
||||
size="lg"
|
||||
iconSize="h-4 w-4"
|
||||
class="h-9 w-9 rounded-full bg-accent text-accent-foreground absolute bottom-4 shadow-md"
|
||||
/>
|
||||
</div>
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
<script lang="ts">
|
||||
import { Trash2 } from '@lucide/svelte';
|
||||
import { ErrorDialogType } from '$lib/enums';
|
||||
import {
|
||||
DialogChatError,
|
||||
DialogConfirmation,
|
||||
DialogEmptyFileAlert,
|
||||
DialogFileUploadError
|
||||
} from '$lib/components/app';
|
||||
|
||||
let {
|
||||
showDeleteDialog,
|
||||
handleDeleteConfirm,
|
||||
showEmptyFileDialog,
|
||||
emptyFileNames,
|
||||
activeErrorDialog,
|
||||
handleErrorDialogOpenChange,
|
||||
fileUpload
|
||||
} = $props();
|
||||
</script>
|
||||
|
||||
<DialogFileUploadError
|
||||
bind:open={fileUpload.showFileErrorDialog}
|
||||
fileErrorData={fileUpload.fileErrorData}
|
||||
/>
|
||||
|
||||
<DialogConfirmation
|
||||
bind:open={showDeleteDialog}
|
||||
title="Delete Conversation"
|
||||
description="Are you sure you want to delete this conversation? This action cannot be undone and will permanently remove all messages in this conversation."
|
||||
confirmText="Delete"
|
||||
cancelText="Cancel"
|
||||
variant="destructive"
|
||||
icon={Trash2}
|
||||
onConfirm={handleDeleteConfirm}
|
||||
onCancel={() => (showDeleteDialog = false)}
|
||||
/>
|
||||
|
||||
<DialogEmptyFileAlert
|
||||
bind:open={showEmptyFileDialog}
|
||||
emptyFiles={emptyFileNames}
|
||||
onOpenChange={(open) => {
|
||||
if (!open) {
|
||||
emptyFileNames = [];
|
||||
}
|
||||
}}
|
||||
/>
|
||||
|
||||
<DialogChatError
|
||||
message={activeErrorDialog?.message ?? ''}
|
||||
contextInfo={activeErrorDialog?.contextInfo}
|
||||
onOpenChange={handleErrorDialogOpenChange}
|
||||
open={Boolean(activeErrorDialog)}
|
||||
type={activeErrorDialog?.type ?? ErrorDialogType.SERVER}
|
||||
/>
|
||||
@@ -2,6 +2,7 @@
|
||||
import { afterNavigate } from '$app/navigation';
|
||||
import { page } from '$app/state';
|
||||
import { ChatForm } from '$lib/components/app';
|
||||
import { isMobile } from '$lib/stores/viewport.svelte';
|
||||
import { onMount } from 'svelte';
|
||||
import { useDraftMessages } from '$lib/hooks/use-draft-messages.svelte';
|
||||
|
||||
@@ -32,7 +33,30 @@
|
||||
}: Props = $props();
|
||||
|
||||
let chatFormRef: ChatForm | undefined = $state(undefined);
|
||||
let formWrapperEl: HTMLDivElement | undefined = $state();
|
||||
let chatId = $derived(page.params.id as string | undefined);
|
||||
|
||||
$effect(() => {
|
||||
if (!formWrapperEl) return;
|
||||
|
||||
const formEl = formWrapperEl.querySelector('form') as HTMLElement | null;
|
||||
if (!formEl) return;
|
||||
|
||||
const updateHeight = () => {
|
||||
const height = Math.round(formEl.getBoundingClientRect().height);
|
||||
document.documentElement.style.setProperty('--chat-form-height', `${height}px`);
|
||||
};
|
||||
|
||||
updateHeight();
|
||||
|
||||
const resizeObserver = new ResizeObserver(updateHeight);
|
||||
resizeObserver.observe(formEl);
|
||||
|
||||
return () => {
|
||||
resizeObserver.disconnect();
|
||||
document.documentElement.style.removeProperty('--chat-form-height');
|
||||
};
|
||||
});
|
||||
let hasLoadingAttachments = $derived(uploadedFiles.some((f) => f.isLoading));
|
||||
let message = $derived(initialMessage);
|
||||
let previousIsLoading = $derived(isLoading);
|
||||
@@ -83,12 +107,14 @@
|
||||
}
|
||||
|
||||
onMount(() => {
|
||||
setTimeout(() => chatFormRef?.focus(), 10);
|
||||
if (!isMobile.current) {
|
||||
setTimeout(() => chatFormRef?.focus(), 100);
|
||||
}
|
||||
});
|
||||
|
||||
afterNavigate((navigation) => {
|
||||
if (navigation?.from != null) {
|
||||
setTimeout(() => chatFormRef?.focus(), 10);
|
||||
if (navigation?.from != null && !isMobile.current) {
|
||||
setTimeout(() => chatFormRef?.focus(), 100);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -108,12 +134,12 @@
|
||||
});
|
||||
</script>
|
||||
|
||||
<div class="relative mx-auto max-w-[48rem]">
|
||||
<div class="chat-screen-form-wrapper" bind:this={formWrapperEl}>
|
||||
<ChatForm
|
||||
class="mx-auto max-w-3xl {className}"
|
||||
bind:this={chatFormRef}
|
||||
bind:value={message}
|
||||
bind:uploadedFiles
|
||||
class={className}
|
||||
{disabled}
|
||||
{isLoading}
|
||||
showMcpPromptButton
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
<script lang="ts">
|
||||
import { fadeInView } from '$lib/actions/fade-in-view.svelte';
|
||||
import { serverStore } from '$lib/stores/server.svelte';
|
||||
|
||||
interface Props {
|
||||
@@ -11,10 +10,9 @@
|
||||
|
||||
<div
|
||||
class={[
|
||||
'pointer-events-none mb-4 hidden px-4 text-center',
|
||||
isEmpty && 'pointer-events-auto block!'
|
||||
'pointer-events-none mb-4 hidden px-4 text-center text-balance',
|
||||
isEmpty && 'mb-[calc(50dvh-8rem)] md:mb-6 pointer-events-auto block!'
|
||||
]}
|
||||
use:fadeInView={{ duration: 300 }}
|
||||
>
|
||||
<h1 class="mb-2 text-2xl font-semibold tracking-tight md:text-3xl">Hello there</h1>
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user