mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 18:17:42 +02:00
Compare commits
18 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 0a0bba05e8 | |||
| 5166aaf868 | |||
| 6ce3d85796 | |||
| e85e9d7637 | |||
| 8dcc3662a2 | |||
| d37fc93505 | |||
| 4470a0764a | |||
| 4301e27319 | |||
| a2c199e479 | |||
| 15dd67d869 | |||
| bde461de8c | |||
| 8faa87db02 | |||
| 6f1f6a961a | |||
| 669696e00d | |||
| 982060fadc | |||
| 6853bee680 | |||
| 487674fbb3 | |||
| acec774ef6 |
@@ -86,6 +86,7 @@ body:
|
||||
description: >
|
||||
If applicable, please copy and paste any relevant log output, including any generated text.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
|
||||
render: shell
|
||||
validations:
|
||||
required: false
|
||||
|
||||
@@ -31,9 +31,10 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
webui-setup:
|
||||
name: WebUI Setup
|
||||
webui-check:
|
||||
name: WebUI Checks
|
||||
runs-on: ubuntu-latest
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
@@ -42,137 +43,66 @@ jobs:
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
id: node
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/server/webui/package-lock.json"
|
||||
|
||||
- name: Cache node_modules
|
||||
uses: actions/cache@v4
|
||||
id: cache-node-modules
|
||||
with:
|
||||
path: tools/server/webui/node_modules
|
||||
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node-modules-
|
||||
|
||||
- name: Install dependencies
|
||||
if: steps.cache-node-modules.outputs.cache-hit != 'true'
|
||||
id: setup
|
||||
if: ${{ steps.node.conclusion == 'success' }}
|
||||
run: npm ci
|
||||
working-directory: tools/server/webui
|
||||
|
||||
webui-check:
|
||||
needs: webui-setup
|
||||
name: WebUI Check
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
|
||||
- name: Restore node_modules cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: tools/server/webui/node_modules
|
||||
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node-modules-
|
||||
|
||||
- name: Run type checking
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run check
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run linting
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run lint
|
||||
working-directory: tools/server/webui
|
||||
|
||||
webui-build:
|
||||
needs: webui-check
|
||||
name: WebUI Build
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
|
||||
- name: Restore node_modules cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: tools/server/webui/node_modules
|
||||
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node-modules-
|
||||
|
||||
- name: Build application
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run build
|
||||
working-directory: tools/server/webui
|
||||
|
||||
webui-tests:
|
||||
needs: webui-build
|
||||
name: Run WebUI tests
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
|
||||
- name: Restore node_modules cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: tools/server/webui/node_modules
|
||||
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node-modules-
|
||||
|
||||
- name: Install Playwright browsers
|
||||
id: playwright
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npx playwright install --with-deps
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Build Storybook
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run build-storybook
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run Client tests
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run test:client
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run Server tests
|
||||
run: npm run test:server
|
||||
- name: Run Unit tests
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run test:unit
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run UI tests
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run test:ui -- --testTimeout=60000
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run E2E tests
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run test:e2e
|
||||
working-directory: tools/server/webui
|
||||
|
||||
server-build:
|
||||
needs: [webui-tests]
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
|
||||
+103
-29
@@ -420,6 +420,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
}
|
||||
};
|
||||
|
||||
std::set<std::string> seen_args;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
const std::string arg_prefix = "--";
|
||||
|
||||
@@ -430,6 +432,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
||||
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
||||
}
|
||||
if (!seen_args.insert(arg).second) {
|
||||
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
||||
}
|
||||
auto & tmp = arg_to_options[arg];
|
||||
auto opt = *tmp.first;
|
||||
bool is_positive = tmp.second;
|
||||
@@ -750,6 +755,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
|
||||
}
|
||||
};
|
||||
|
||||
std::set<std::string> seen_args;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
const std::string arg_prefix = "--";
|
||||
|
||||
@@ -760,6 +767,9 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
|
||||
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
||||
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
||||
}
|
||||
if (!seen_args.insert(arg).second) {
|
||||
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
||||
}
|
||||
auto opt = *arg_to_options[arg];
|
||||
std::string val;
|
||||
if (opt.value_hint != nullptr) {
|
||||
@@ -1226,13 +1236,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
|
||||
add_opt(common_arg(
|
||||
{"--in-file"}, "FNAME",
|
||||
"an input file (repeat to specify multiple files)",
|
||||
"an input file (use comma-separated values to specify multiple files)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
std::ifstream file(item);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
|
||||
}
|
||||
params.in_files.push_back(item);
|
||||
}
|
||||
params.in_files.push_back(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
@@ -1969,9 +1981,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
|
||||
add_opt(common_arg(
|
||||
{"--image", "--audio"}, "FILE",
|
||||
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
|
||||
"path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.image.emplace_back(value);
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
params.image.emplace_back(item);
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
|
||||
add_opt(common_arg(
|
||||
@@ -2218,12 +2232,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--override-kv"}, "KEY=TYPE:VALUE",
|
||||
"advanced option to override model metadata by key. may be specified multiple times.\n"
|
||||
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
|
||||
{"--override-kv"}, "KEY=TYPE:VALUE,...",
|
||||
"advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
|
||||
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
|
||||
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
|
||||
std::vector<std::string> kv_overrides;
|
||||
|
||||
std::string current;
|
||||
bool escaping = false;
|
||||
|
||||
for (const char c : value) {
|
||||
if (escaping) {
|
||||
current.push_back(c);
|
||||
escaping = false;
|
||||
} else if (c == '\\') {
|
||||
escaping = true;
|
||||
} else if (c == ',') {
|
||||
kv_overrides.push_back(current);
|
||||
current.clear();
|
||||
} else {
|
||||
current.push_back(c);
|
||||
}
|
||||
}
|
||||
|
||||
if (escaping) {
|
||||
current.push_back('\\');
|
||||
}
|
||||
|
||||
kv_overrides.push_back(current);
|
||||
|
||||
for (const auto & kv_override : kv_overrides) {
|
||||
if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
|
||||
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
|
||||
}
|
||||
}
|
||||
}
|
||||
));
|
||||
@@ -2237,33 +2278,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--lora"}, "FNAME",
|
||||
"path to LoRA adapter (can be repeated to use multiple adapters)",
|
||||
"path to LoRA adapter (use comma-separated values to load multiple adapters)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
|
||||
}
|
||||
}
|
||||
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
||||
add_opt(common_arg(
|
||||
{"--lora-scaled"}, "FNAME", "SCALE",
|
||||
"path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
|
||||
[](common_params & params, const std::string & fname, const std::string & scale) {
|
||||
params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
|
||||
{"--lora-scaled"}, "FNAME:SCALE,...",
|
||||
"path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
|
||||
"note: use comma-separated values",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
auto parts = string_split<std::string>(item, ':');
|
||||
if (parts.size() != 2) {
|
||||
throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
|
||||
}
|
||||
params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
|
||||
}
|
||||
}
|
||||
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
||||
add_opt(common_arg(
|
||||
{"--control-vector"}, "FNAME",
|
||||
"add a control vector\nnote: this argument can be repeated to add multiple control vectors",
|
||||
"add a control vector\nnote: use comma-separated values to add multiple control vectors",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.control_vectors.push_back({ 1.0f, value, });
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
params.control_vectors.push_back({ 1.0f, item, });
|
||||
}
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--control-vector-scaled"}, "FNAME", "SCALE",
|
||||
{"--control-vector-scaled"}, "FNAME:SCALE,...",
|
||||
"add a control vector with user defined scaling SCALE\n"
|
||||
"note: this argument can be repeated to add multiple scaled control vectors",
|
||||
[](common_params & params, const std::string & fname, const std::string & scale) {
|
||||
params.control_vectors.push_back({ std::stof(scale), fname });
|
||||
"note: use comma-separated values (format: FNAME:SCALE,...)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
auto parts = string_split<std::string>(item, ':');
|
||||
if (parts.size() != 2) {
|
||||
throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
|
||||
}
|
||||
params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
|
||||
}
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
@@ -2353,13 +2411,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_env("HF_TOKEN"));
|
||||
add_opt(common_arg(
|
||||
{"--context-file"}, "FNAME",
|
||||
"file to load context from (repeat to specify multiple files)",
|
||||
"file to load context from (use comma-separated values to specify multiple files)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value, std::ios::binary);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
for (const auto & item : string_split<std::string>(value, ',')) {
|
||||
std::ifstream file(item, std::ios::binary);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
|
||||
}
|
||||
params.context_files.push_back(item);
|
||||
}
|
||||
params.context_files.push_back(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
||||
add_opt(common_arg(
|
||||
@@ -2550,6 +2610,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.api_prefix = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
|
||||
add_opt(common_arg(
|
||||
{"--webui-config"}, "JSON",
|
||||
"JSON that provides default WebUI settings (overrides WebUI defaults)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.webui_config_json = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
|
||||
add_opt(common_arg(
|
||||
{"--webui-config-file"}, "PATH",
|
||||
"JSON file that provides default WebUI settings (overrides WebUI defaults)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.webui_config_json = read_file(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"--webui"},
|
||||
{"--no-webui"},
|
||||
|
||||
+1
-1
@@ -1092,7 +1092,7 @@ common_init_result::common_init_result(common_params & params) :
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
|
||||
if (params.fit_params) {
|
||||
LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
|
||||
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
|
||||
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
||||
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||
|
||||
+4
-1
@@ -484,8 +484,11 @@ struct common_params {
|
||||
|
||||
std::map<std::string, std::string> default_template_kwargs;
|
||||
|
||||
// webui configs
|
||||
bool webui = true;
|
||||
std::string webui_config_json;
|
||||
|
||||
// "advanced" endpoints are disabled by default for better security
|
||||
bool webui = true;
|
||||
bool endpoint_slots = true;
|
||||
bool endpoint_props = false; // only control POST requests, not GET
|
||||
bool endpoint_metrics = false;
|
||||
|
||||
+51
-37
@@ -104,10 +104,9 @@ struct ring_buffer {
|
||||
struct common_sampler {
|
||||
common_params_sampling params;
|
||||
|
||||
struct llama_sampler * grmr;
|
||||
struct llama_sampler * chain;
|
||||
|
||||
bool grammar;
|
||||
|
||||
ring_buffer<llama_token> prev;
|
||||
|
||||
std::vector<llama_token_data> cur;
|
||||
@@ -167,15 +166,14 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
|
||||
lparams.no_perf = params.no_perf;
|
||||
|
||||
llama_sampler * grmr = nullptr;
|
||||
llama_sampler * chain = llama_sampler_chain_init(lparams);
|
||||
|
||||
bool grammar = false;
|
||||
std::vector<llama_sampler *> samplers;
|
||||
|
||||
if (params.grammar.compare(0, 11, "%llguidance") == 0) {
|
||||
#ifdef LLAMA_USE_LLGUIDANCE
|
||||
samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
|
||||
grammar = true;
|
||||
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
|
||||
#else
|
||||
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
|
||||
#endif // LLAMA_USE_LLGUIDANCE
|
||||
@@ -224,15 +222,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
|
||||
if (!params.grammar.empty()) {
|
||||
if (params.grammar_lazy) {
|
||||
samplers.push_back(
|
||||
llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
|
||||
trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
trigger_tokens.data(), trigger_tokens.size()));
|
||||
grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
|
||||
trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
trigger_tokens.data(), trigger_tokens.size());
|
||||
} else {
|
||||
samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
|
||||
grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
|
||||
}
|
||||
|
||||
grammar = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -303,8 +298,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
|
||||
auto * result = new common_sampler {
|
||||
/* .params = */ params,
|
||||
/* .grmr = */ grmr,
|
||||
/* .chain = */ chain,
|
||||
/* .grammar = */ grammar,
|
||||
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
|
||||
/* .cur = */ {},
|
||||
/* .cur_p = */ {},
|
||||
@@ -315,6 +310,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
|
||||
void common_sampler_free(struct common_sampler * gsmpl) {
|
||||
if (gsmpl) {
|
||||
llama_sampler_free(gsmpl->grmr);
|
||||
llama_sampler_free(gsmpl->chain);
|
||||
|
||||
delete gsmpl;
|
||||
@@ -324,25 +320,12 @@ void common_sampler_free(struct common_sampler * gsmpl) {
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
if (gsmpl->grammar) {
|
||||
const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
|
||||
|
||||
for (int i = 0; i < n_smpl; i++) {
|
||||
auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
|
||||
|
||||
// the grammar sampler is always the first one
|
||||
if (i == 0) {
|
||||
if (accept_grammar) {
|
||||
llama_sampler_accept(smpl, token);
|
||||
}
|
||||
} else {
|
||||
llama_sampler_accept(smpl, token);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
llama_sampler_accept(gsmpl->chain, token);
|
||||
if (gsmpl->grmr && accept_grammar) {
|
||||
llama_sampler_accept(gsmpl->grmr, token);
|
||||
}
|
||||
|
||||
llama_sampler_accept(gsmpl->chain, token);
|
||||
|
||||
gsmpl->prev.push_back(token);
|
||||
}
|
||||
|
||||
@@ -353,8 +336,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
|
||||
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||
return new common_sampler {
|
||||
/* .params = */ gsmpl->params,
|
||||
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
||||
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
||||
/* .grammar = */ gsmpl->grammar,
|
||||
/* .prev = */ gsmpl->prev,
|
||||
/* .cur = */ gsmpl->cur,
|
||||
/* .cur_p = */ gsmpl->cur_p,
|
||||
@@ -410,7 +393,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
|
||||
return gsmpl->chain;
|
||||
}
|
||||
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
|
||||
@@ -418,11 +401,42 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
|
||||
llama_token id = LLAMA_TOKEN_NULL;
|
||||
|
||||
auto & grmr = gsmpl->grmr;
|
||||
auto & chain = gsmpl->chain;
|
||||
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
|
||||
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
if (grammar_first) {
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
}
|
||||
|
||||
llama_sampler_apply(chain, &cur_p);
|
||||
|
||||
id = cur_p.data[cur_p.selected].id;
|
||||
|
||||
if (grammar_first) {
|
||||
return id;
|
||||
}
|
||||
|
||||
// check if it the sampled token fits the grammar (grammar-based rejection sampling)
|
||||
{
|
||||
llama_token_data single_token_data = { id, 1.0f, 0.0f };
|
||||
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
|
||||
|
||||
llama_sampler_apply(grmr, &single_token_data_array);
|
||||
|
||||
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
|
||||
if (is_valid) {
|
||||
return id;
|
||||
}
|
||||
}
|
||||
|
||||
// resampling:
|
||||
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
llama_sampler_apply(chain, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
|
||||
@@ -432,7 +446,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
return id;
|
||||
}
|
||||
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
|
||||
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
|
||||
|
||||
std::vector<llama_token> result;
|
||||
@@ -440,7 +454,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
|
||||
|
||||
size_t i = 0;
|
||||
for (; i < draft.size(); i++) {
|
||||
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
|
||||
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
|
||||
|
||||
common_sampler_accept(gsmpl, id, true);
|
||||
|
||||
@@ -452,7 +466,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
|
||||
}
|
||||
|
||||
if (i == draft.size()) {
|
||||
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
|
||||
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
|
||||
|
||||
common_sampler_accept(gsmpl, id, true);
|
||||
|
||||
@@ -462,13 +476,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
|
||||
std::vector<int> idxs(draft.size() + 1);
|
||||
for (size_t i = 0; i < idxs.size(); ++i) {
|
||||
idxs[i] = i;
|
||||
}
|
||||
|
||||
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
|
||||
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
|
||||
}
|
||||
|
||||
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||
|
||||
+6
-3
@@ -57,7 +57,10 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
|
||||
// - check if the token fits the grammar (if any)
|
||||
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
|
||||
//
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
|
||||
// if grammar_first is true, the grammar is applied before the samplers (slower)
|
||||
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
|
||||
//
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
|
||||
|
||||
// generalized version of common_sampler_sample
|
||||
//
|
||||
@@ -75,10 +78,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
//
|
||||
// returns at least 1 token, up to idxs.size()
|
||||
//
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
|
||||
|
||||
// assume idxs == [ 0, 1, 2, ..., draft.size() ]
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
|
||||
|
||||
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
|
||||
|
||||
|
||||
@@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
|
||||
for (int i = 0; i < params.n_draft; ++i) {
|
||||
common_batch_clear(batch);
|
||||
|
||||
common_sampler_sample(smpl, ctx_dft, 0);
|
||||
common_sampler_sample(smpl, ctx_dft, 0, true);
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(smpl, true);
|
||||
|
||||
|
||||
@@ -1838,7 +1838,7 @@ class MmprojModel(ModelBase):
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, name, n_dims # unused
|
||||
if ".patch_embd.weight" in new_name:
|
||||
if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name:
|
||||
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
|
||||
return False
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
import importlib
|
||||
from pathlib import Path
|
||||
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
@@ -116,11 +116,11 @@ def debug_hook(name):
|
||||
def fn(_m, input, output):
|
||||
if isinstance(input, torch.Tensor):
|
||||
summarize(input, name + "_in")
|
||||
elif isinstance(input, (tuple, list)) and isinstance(input[0], torch.Tensor):
|
||||
elif isinstance(input, (tuple, list)) and len(input) > 0 and isinstance(input[0], torch.Tensor):
|
||||
summarize(input[0], name + "_in")
|
||||
if isinstance(output, torch.Tensor):
|
||||
summarize(output, name + "_out")
|
||||
elif isinstance(output, (tuple, list)) and isinstance(output[0], torch.Tensor):
|
||||
elif isinstance(output, (tuple, list)) and len(output) > 0 and isinstance(output[0], torch.Tensor):
|
||||
summarize(output[0], name + "_out")
|
||||
|
||||
return fn
|
||||
@@ -130,6 +130,7 @@ unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
|
||||
|
||||
parser = argparse.ArgumentParser(description="Process model with specified path")
|
||||
parser.add_argument("--model-path", "-m", help="Path to the model")
|
||||
parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False)
|
||||
args = parser.parse_args()
|
||||
|
||||
model_path = os.environ.get("MODEL_PATH", args.model_path)
|
||||
@@ -142,8 +143,13 @@ if model_path is None:
|
||||
print("Loading model and tokenizer using AutoTokenizer:", model_path)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
||||
multimodal = False
|
||||
full_config = config
|
||||
|
||||
print("Model type: ", config.model_type)
|
||||
if "vocab_size" not in config and "text_config" in config:
|
||||
config = config.text_config
|
||||
multimodal = True
|
||||
print("Vocab size: ", config.vocab_size)
|
||||
print("Hidden size: ", config.hidden_size)
|
||||
print("Number of layers: ", config.num_hidden_layers)
|
||||
@@ -169,9 +175,14 @@ if unreleased_model_name:
|
||||
print(f"Failed to import or load model: {e}")
|
||||
exit(1)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=config
|
||||
)
|
||||
if multimodal:
|
||||
model = AutoModelForImageTextToText.from_pretrained(
|
||||
model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=full_config
|
||||
)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=config
|
||||
)
|
||||
|
||||
for name, module in model.named_modules():
|
||||
if len(list(module.children())) == 0: # only leaf modules
|
||||
@@ -185,7 +196,10 @@ model_name = os.path.basename(model_path)
|
||||
print(f"Model class: {model.__class__.__name__}")
|
||||
|
||||
device = next(model.parameters()).device
|
||||
if os.getenv("MODEL_TESTING_PROMPT"):
|
||||
if args.prompt_file:
|
||||
with open(args.prompt_file, encoding='utf-8') as f:
|
||||
prompt = f.read()
|
||||
elif os.getenv("MODEL_TESTING_PROMPT"):
|
||||
prompt = os.getenv("MODEL_TESTING_PROMPT")
|
||||
else:
|
||||
prompt = "Hello, my name is"
|
||||
@@ -195,9 +209,18 @@ print(f"Input tokens: {input_ids}")
|
||||
print(f"Input text: {repr(prompt)}")
|
||||
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
|
||||
|
||||
batch_size = 512
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(input_ids.to(model.device))
|
||||
logits = outputs.logits
|
||||
past = None
|
||||
outputs = None
|
||||
for i in range(0, input_ids.size(1), batch_size):
|
||||
print(f"Processing chunk with tokens {i} to {i + batch_size}")
|
||||
chunk = input_ids[:, i:i + batch_size]
|
||||
outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True)
|
||||
past = outputs.past_key_values
|
||||
|
||||
logits = outputs.logits # type: ignore
|
||||
|
||||
# Extract logits for the last token (next token prediction)
|
||||
last_logits = logits[0, -1, :].float().cpu().numpy()
|
||||
|
||||
@@ -242,7 +242,7 @@ int main(int argc, char ** argv) {
|
||||
bool accept = false;
|
||||
if (params.sampling.temp > 0) {
|
||||
// stochastic verification
|
||||
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
|
||||
|
||||
auto & dist_tgt = *common_sampler_get_candidates(smpl, true);
|
||||
|
||||
@@ -491,7 +491,7 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft);
|
||||
common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl, true);
|
||||
|
||||
|
||||
@@ -43,6 +43,8 @@
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
@@ -51,6 +53,8 @@
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
@@ -67,10 +71,14 @@
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__POWERPC__) || defined(__powerpc__)
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
|
||||
// quants.c
|
||||
@@ -91,6 +99,8 @@
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
@@ -99,6 +109,8 @@
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__loongarch64)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
@@ -119,6 +131,8 @@
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
@@ -127,6 +141,8 @@
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__riscv)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
@@ -154,6 +170,8 @@
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
@@ -161,6 +179,8 @@
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__s390x__)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
@@ -187,6 +207,8 @@
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
@@ -195,6 +217,8 @@
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__wasm__)
|
||||
// quants.c
|
||||
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||
@@ -223,6 +247,8 @@
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
|
||||
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
@@ -231,4 +257,6 @@
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#endif
|
||||
|
||||
@@ -786,6 +786,133 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
|
||||
ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemv_q8_0_4x4_q8_0(int n,
|
||||
float * GGML_RESTRICT s,
|
||||
size_t bs,
|
||||
const void * GGML_RESTRICT vx,
|
||||
const void * GGML_RESTRICT vy,
|
||||
int nr,
|
||||
int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 4;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx;
|
||||
|
||||
for (int c = 0; c < nc; c += ncols_interleaved) {
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
float32x4_t acc = vdupq_n_f32(0);
|
||||
for (int b = 0; b < nb; b++) {
|
||||
int8x16x4_t b_low = vld1q_s8_x4((const int8_t *) b_ptr->qs);
|
||||
int8x16x4_t b_high = vld1q_s8_x4((const int8_t *) b_ptr->qs + 64);
|
||||
float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
|
||||
|
||||
int8x16x2_t a = vld1q_s8_x2(a_ptr->qs);
|
||||
float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
|
||||
|
||||
int32x4_t ret = vdupq_n_s32(0);
|
||||
|
||||
ret = vdotq_laneq_s32(ret, b_low.val[0], a.val[0], 0);
|
||||
ret = vdotq_laneq_s32(ret, b_low.val[1], a.val[0], 1);
|
||||
ret = vdotq_laneq_s32(ret, b_low.val[2], a.val[0], 2);
|
||||
ret = vdotq_laneq_s32(ret, b_low.val[3], a.val[0], 3);
|
||||
|
||||
ret = vdotq_laneq_s32(ret, b_high.val[0], a.val[1], 0);
|
||||
ret = vdotq_laneq_s32(ret, b_high.val[1], a.val[1], 1);
|
||||
ret = vdotq_laneq_s32(ret, b_high.val[2], a.val[1], 2);
|
||||
ret = vdotq_laneq_s32(ret, b_high.val[3], a.val[1], 3);
|
||||
|
||||
acc = vfmaq_f32(acc, vcvtq_f32_s32(ret), vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
|
||||
a_ptr++;
|
||||
b_ptr++;
|
||||
}
|
||||
vst1q_f32(s, acc);
|
||||
s += ncols_interleaved;
|
||||
}
|
||||
return;
|
||||
|
||||
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
ggml_gemv_q8_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemv_q8_0_4x8_q8_0(int n,
|
||||
float * GGML_RESTRICT s,
|
||||
size_t bs,
|
||||
const void * GGML_RESTRICT vx,
|
||||
const void * GGML_RESTRICT vy,
|
||||
int nr,
|
||||
int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx;
|
||||
|
||||
for (int c = 0; c < nc; c += ncols_interleaved) {
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
float32x4_t acc = vdupq_n_f32(0);
|
||||
|
||||
for (int b = 0; b < nb; b++) {
|
||||
int8x16x4_t b_low = vld1q_s8_x4((const int8_t *) b_ptr->qs);
|
||||
int8x16x4_t b_high = vld1q_s8_x4((const int8_t *) b_ptr->qs + 64);
|
||||
float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
|
||||
|
||||
int8x8x4_t a_chunks = vld1_s8_x4(a_ptr->qs);
|
||||
int8x16_t a0 = vcombine_s8(a_chunks.val[0], a_chunks.val[0]);
|
||||
int8x16_t a1 = vcombine_s8(a_chunks.val[1], a_chunks.val[1]);
|
||||
int8x16_t a2 = vcombine_s8(a_chunks.val[2], a_chunks.val[2]);
|
||||
int8x16_t a3 = vcombine_s8(a_chunks.val[3], a_chunks.val[3]);
|
||||
float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
|
||||
|
||||
int32x4_t ret0 = vdupq_n_s32(0);
|
||||
int32x4_t ret1 = vdupq_n_s32(0);
|
||||
|
||||
// 0..7
|
||||
ret0 = vdotq_s32(ret0, b_low.val[0], a0);
|
||||
ret1 = vdotq_s32(ret1, b_low.val[1], a0);
|
||||
// 8..15
|
||||
ret0 = vdotq_s32(ret0, b_low.val[2], a1);
|
||||
ret1 = vdotq_s32(ret1, b_low.val[3], a1);
|
||||
// 16..23
|
||||
ret0 = vdotq_s32(ret0, b_high.val[0], a2);
|
||||
ret1 = vdotq_s32(ret1, b_high.val[1], a2);
|
||||
// 24..31
|
||||
ret0 = vdotq_s32(ret0, b_high.val[2], a3);
|
||||
ret1 = vdotq_s32(ret1, b_high.val[3], a3);
|
||||
|
||||
int32x4_t ret = vpaddq_s32(ret0, ret1);
|
||||
|
||||
acc = vfmaq_f32(acc, vcvtq_f32_s32(ret), vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
|
||||
a_ptr++;
|
||||
b_ptr++;
|
||||
}
|
||||
vst1q_f32(s, acc);
|
||||
s += ncols_interleaved;
|
||||
}
|
||||
return;
|
||||
|
||||
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
ggml_gemv_q8_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
@@ -2610,3 +2737,159 @@ void ggml_gemm_q4_K_8x8_q8_K(int n,
|
||||
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
|
||||
void ggml_gemm_q8_0_4x4_q8_0(int n,
|
||||
float * GGML_RESTRICT s,
|
||||
size_t bs,
|
||||
const void * GGML_RESTRICT vx,
|
||||
const void * GGML_RESTRICT vy,
|
||||
int nr,
|
||||
int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 4;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nr % 4 == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
||||
|
||||
float32x4_t sumf[4];
|
||||
for (int m = 0; m < 4; m++) {
|
||||
sumf[m] = vdupq_n_f32(0);
|
||||
}
|
||||
|
||||
for (int l = 0; l < nb; l++) {
|
||||
float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *) a_ptr[l].d));
|
||||
float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *) b_ptr[l].d));
|
||||
|
||||
int32x4_t sumi_0 = vdupq_n_s32(0);
|
||||
int32x4_t sumi_1 = vdupq_n_s32(0);
|
||||
int32x4_t sumi_2 = vdupq_n_s32(0);
|
||||
int32x4_t sumi_3 = vdupq_n_s32(0);
|
||||
|
||||
for (int k_group = 0; k_group < 8; k_group += 4) {
|
||||
int8x16x4_t a = vld1q_s8_x4(a_ptr[l].qs + 16 * k_group);
|
||||
int8x16x4_t b = vld1q_s8_x4(b_ptr[l].qs + 16 * k_group);
|
||||
|
||||
for (int k = 0; k < 4; k++) {
|
||||
sumi_0 = vdotq_laneq_s32(sumi_0, b.val[k], a.val[k], 0);
|
||||
sumi_1 = vdotq_laneq_s32(sumi_1, b.val[k], a.val[k], 1);
|
||||
sumi_2 = vdotq_laneq_s32(sumi_2, b.val[k], a.val[k], 2);
|
||||
sumi_3 = vdotq_laneq_s32(sumi_3, b.val[k], a.val[k], 3);
|
||||
}
|
||||
}
|
||||
|
||||
sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
|
||||
sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
|
||||
sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
|
||||
sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
|
||||
}
|
||||
|
||||
for (int m = 0; m < 4; m++) {
|
||||
vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
ggml_gemm_q8_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q8_0_4x8_q8_0(int n,
|
||||
float * GGML_RESTRICT s,
|
||||
size_t bs,
|
||||
const void * GGML_RESTRICT vx,
|
||||
const void * GGML_RESTRICT vy,
|
||||
int nr,
|
||||
int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nr % 4 == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;
|
||||
|
||||
for (int y = 0; y < nr; y += 4) {
|
||||
const block_q8_0x4 * a_ptr_base = (const block_q8_0x4 *) vy + (y / 4) * nb;
|
||||
|
||||
for (int x = 0; x < nc; x += ncols_interleaved) {
|
||||
const block_q8_0x4 * b_ptr = b_ptr_base + (x / 4) * nb;
|
||||
const block_q8_0x4 * a_ptr = a_ptr_base;
|
||||
|
||||
float32x4_t acc_f32[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
acc_f32[i] = vdupq_n_f32(0);
|
||||
}
|
||||
|
||||
for (int b = 0; b < nb; b++) {
|
||||
int32x4_t acc[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
acc[i] = vdupq_n_s32(0);
|
||||
}
|
||||
|
||||
// Process 4 chunks of 8 positions each
|
||||
for (int chunk = 0; chunk < 4; chunk++) {
|
||||
int8x16_t a01 = vld1q_s8(a_ptr->qs + chunk * 32);
|
||||
int8x16_t a23 = vld1q_s8(a_ptr->qs + chunk * 32 + 16);
|
||||
int8x16_t b01 = vld1q_s8(b_ptr->qs + chunk * 32);
|
||||
int8x16_t b23 = vld1q_s8(b_ptr->qs + chunk * 32 + 16);
|
||||
|
||||
acc[0] = vmmlaq_s32(acc[0], a01, b01);
|
||||
acc[1] = vmmlaq_s32(acc[1], a01, b23);
|
||||
acc[2] = vmmlaq_s32(acc[2], a23, b01);
|
||||
acc[3] = vmmlaq_s32(acc[3], a23, b23);
|
||||
}
|
||||
|
||||
// Reorder outputs from 2×2 tiles to row-major
|
||||
// acc[0] = [r0c0, r0c1, r1c0, r1c1]
|
||||
// acc[1] = [r0c2, r0c3, r1c2, r1c3]
|
||||
// acc[2] = [r2c0, r2c1, r3c0, r3c1]
|
||||
// acc[3] = [r2c2, r2c3, r3c2, r3c3]
|
||||
int32x4_t row0 = vcombine_s32(vget_low_s32(acc[0]), vget_low_s32(acc[1]));
|
||||
int32x4_t row1 = vcombine_s32(vget_high_s32(acc[0]), vget_high_s32(acc[1]));
|
||||
int32x4_t row2 = vcombine_s32(vget_low_s32(acc[2]), vget_low_s32(acc[3]));
|
||||
int32x4_t row3 = vcombine_s32(vget_high_s32(acc[2]), vget_high_s32(acc[3]));
|
||||
|
||||
// Scales
|
||||
float32x4_t a_d = vcvt_f32_f16(vld1_f16((const __fp16 *) a_ptr->d));
|
||||
float32x4_t b_d = vcvt_f32_f16(vld1_f16((const __fp16 *) b_ptr->d));
|
||||
|
||||
acc_f32[0] = vfmaq_f32(acc_f32[0], vcvtq_f32_s32(row0), vmulq_laneq_f32(b_d, a_d, 0));
|
||||
acc_f32[1] = vfmaq_f32(acc_f32[1], vcvtq_f32_s32(row1), vmulq_laneq_f32(b_d, a_d, 1));
|
||||
acc_f32[2] = vfmaq_f32(acc_f32[2], vcvtq_f32_s32(row2), vmulq_laneq_f32(b_d, a_d, 2));
|
||||
acc_f32[3] = vfmaq_f32(acc_f32[3], vcvtq_f32_s32(row3), vmulq_laneq_f32(b_d, a_d, 3));
|
||||
|
||||
a_ptr++;
|
||||
b_ptr++;
|
||||
}
|
||||
|
||||
for (int row = 0; row < 4; row++) {
|
||||
vst1q_f32(s + (y + row) * bs + x, acc_f32[row]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
ggml_gemm_q8_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
@@ -692,6 +692,100 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
|
||||
float * GGML_RESTRICT s,
|
||||
size_t bs,
|
||||
const void * GGML_RESTRICT vx,
|
||||
const void * GGML_RESTRICT vy,
|
||||
int nr,
|
||||
int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 4;
|
||||
|
||||
assert(nr == 1);
|
||||
assert(n % qk == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(bs);
|
||||
UNUSED(nr);
|
||||
|
||||
float sumf[4];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumf[j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / blocklen); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
||||
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
|
||||
float * GGML_RESTRICT s,
|
||||
size_t bs,
|
||||
const void * GGML_RESTRICT vx,
|
||||
const void * GGML_RESTRICT vy,
|
||||
int nr,
|
||||
int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert(nr == 1);
|
||||
assert(n % qk == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(bs);
|
||||
UNUSED(nr);
|
||||
|
||||
float sumf[4];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumf[j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / blocklen); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
||||
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
@@ -1219,8 +1313,129 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
|
||||
float * GGML_RESTRICT s,
|
||||
size_t bs,
|
||||
const void * GGML_RESTRICT vx,
|
||||
const void * GGML_RESTRICT vy,
|
||||
int nr,
|
||||
int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 4;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nr % 4 == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
float sumf[4][4];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumf[m][j] = 0.0;
|
||||
}
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / blocklen); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
||||
sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
||||
}
|
||||
sumf[m][j] +=
|
||||
sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
|
||||
float * GGML_RESTRICT s,
|
||||
size_t bs,
|
||||
const void * GGML_RESTRICT vx,
|
||||
const void * GGML_RESTRICT vy,
|
||||
int nr,
|
||||
int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nr % 4 == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
float sumf[4][4];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumf[m][j] = 0.0;
|
||||
}
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / blocklen); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
||||
sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
||||
}
|
||||
sumf[m][j] +=
|
||||
sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
|
||||
static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) {
|
||||
block_q8_0x4 out;
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
out.d[i] = in[i].d;
|
||||
}
|
||||
|
||||
const int end = QK8_0 * 4 / blck_size_interleave;
|
||||
for (int i = 0; i < end; ++i) {
|
||||
int src_id = i % 4;
|
||||
int src_offset = (i / 4) * blck_size_interleave;
|
||||
int dst_offset = i * blck_size_interleave;
|
||||
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
|
||||
block_q4_0x4 out;
|
||||
|
||||
@@ -1534,6 +1749,38 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
|
||||
GGML_UNUSED(data_size);
|
||||
}
|
||||
|
||||
static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
|
||||
int interleave_block,
|
||||
const void * GGML_RESTRICT data,
|
||||
size_t data_size) {
|
||||
GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
|
||||
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
||||
constexpr int nrows_interleaved = 4;
|
||||
|
||||
block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
|
||||
const block_q8_0 * src = (const block_q8_0 *) data;
|
||||
block_q8_0 dst_tmp[4];
|
||||
int nrow = ggml_nrows(t);
|
||||
int nblocks = t->ne[0] / QK8_0;
|
||||
|
||||
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
|
||||
|
||||
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
||||
for (int64_t x = 0; x < nblocks; x++) {
|
||||
for (int i = 0; i < nrows_interleaved; i++) {
|
||||
dst_tmp[i] = src[x + i * nblocks];
|
||||
}
|
||||
*dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
|
||||
}
|
||||
src += nrows_interleaved * nblocks;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
||||
block_iq4_nlx4 out;
|
||||
|
||||
@@ -1702,6 +1949,14 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
|
||||
return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
|
||||
}
|
||||
|
||||
template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||
return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
|
||||
}
|
||||
|
||||
template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||
return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
|
||||
}
|
||||
|
||||
// gemv
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
||||
void gemv(int, float *, size_t, const void *, const void *, int, int);
|
||||
@@ -1738,6 +1993,14 @@ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
|
||||
ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
// gemm
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
||||
void gemm(int, float *, size_t, const void *, const void *, int, int);
|
||||
@@ -1774,6 +2037,14 @@ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
|
||||
ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
class tensor_traits_base : public ggml::cpu::tensor_traits {
|
||||
public:
|
||||
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
|
||||
@@ -2168,6 +2439,10 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
||||
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
||||
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
|
||||
|
||||
// instance for Q8_0
|
||||
static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
|
||||
static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
|
||||
|
||||
if (cur->type == GGML_TYPE_Q4_0) {
|
||||
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
|
||||
|| (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
|
||||
@@ -2218,6 +2493,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
||||
return &iq4_nl_4x4_q8_0;
|
||||
}
|
||||
}
|
||||
} else if (cur->type == GGML_TYPE_Q8_0) {
|
||||
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
||||
if (cur->ne[1] % 4 == 0) {
|
||||
return &q8_0_4x8_q8_0;
|
||||
}
|
||||
}
|
||||
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
||||
if (cur->ne[1] % 4 == 0) {
|
||||
return &q8_0_4x4_q8_0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
|
||||
@@ -98,6 +98,10 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
// Native implementations
|
||||
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
@@ -120,6 +124,10 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
} // extern "C"
|
||||
|
||||
+129
-111
@@ -76,15 +76,31 @@ namespace ggml_cuda_mma {
|
||||
// For the A/C matrices this means I major == row major, J major == column major.
|
||||
// For the B matrix this means I major == column major, J major == row major.
|
||||
// MIRRORED == Each data value is held exactly once per thread subgroup.
|
||||
DATA_LAYOUT_I_MAJOR = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell.
|
||||
DATA_LAYOUT_I_MAJOR_MIRRORED = 10,
|
||||
DATA_LAYOUT_J_MAJOR_MIRRORED = 20,
|
||||
DATA_LAYOUT_I_MAJOR = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell, matrix A&B for RDNA4 and CDNA.
|
||||
DATA_LAYOUT_J_MAJOR = 10, // Matrix C for CDNA and RDNA4, int and float matrix C for RDNA3.
|
||||
DATA_LAYOUT_I_MAJOR_MIRRORED = 20,
|
||||
DATA_LAYOUT_J_MAJOR_MIRRORED = 30,
|
||||
DATA_LAYOUT_I_MAJOR_DUAL = 40, // Matrix A&B for RDNA3.
|
||||
};
|
||||
// Implemented mma combinations are:
|
||||
// - (I_MAJOR, I_MAJOR) -> I_MAJOR
|
||||
// - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
|
||||
// - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
|
||||
|
||||
constexpr bool is_i_major(const data_layout dl) {
|
||||
return dl == DATA_LAYOUT_I_MAJOR ||
|
||||
dl == DATA_LAYOUT_I_MAJOR_MIRRORED ||
|
||||
dl == DATA_LAYOUT_I_MAJOR_DUAL;
|
||||
}
|
||||
|
||||
constexpr data_layout get_input_data_layout() {
|
||||
#if defined(RDNA3)
|
||||
return DATA_LAYOUT_I_MAJOR_DUAL;
|
||||
#else
|
||||
return DATA_LAYOUT_I_MAJOR;
|
||||
#endif // defined(RDNA3)
|
||||
}
|
||||
|
||||
template <int I_, int J_, typename T, data_layout ds_=DATA_LAYOUT_I_MAJOR>
|
||||
struct tile {};
|
||||
|
||||
@@ -115,9 +131,9 @@ namespace ggml_cuda_mma {
|
||||
} else if constexpr (I == 32 && J == 4) {
|
||||
return threadIdx.x % 32;
|
||||
} else if constexpr (I == 16 && J == 16) {
|
||||
return 4 * (threadIdx.x / 16) + l;
|
||||
return threadIdx.x % 16;
|
||||
} else if constexpr (I == 32 && J == 32) {
|
||||
return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4);
|
||||
return threadIdx.x % 32;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
@@ -132,9 +148,9 @@ namespace ggml_cuda_mma {
|
||||
} else if constexpr (I == 32 && J == 4) {
|
||||
return 2 * (threadIdx.x / 32) + l;
|
||||
} else if constexpr (I == 16 && J == 16) {
|
||||
return threadIdx.x % 16;
|
||||
return 4 * (threadIdx.x / 16) + l;
|
||||
} else if constexpr (I == 32 && J == 32) {
|
||||
return threadIdx.x % 32;
|
||||
return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4);
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
@@ -171,28 +187,19 @@ namespace ggml_cuda_mma {
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(RDNA4)
|
||||
static constexpr int ne = I * J / 32;
|
||||
#elif defined(RDNA3)
|
||||
static constexpr int ne = (I == 16 && J == 16) ? I * J / 32 : I * J / 16;
|
||||
#endif // defined(RDNA4)
|
||||
T x[ne] = {0};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 16 && J == 16) return true;
|
||||
if (I == 16 && J == 8) return true;
|
||||
if (I == 16 && J == 4) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
if constexpr (I == 16 && J == 16) {
|
||||
#if defined(RDNA4)
|
||||
return 8 * (threadIdx.x / 16) + l;
|
||||
#elif defined(RDNA3)
|
||||
return 2 * l + (threadIdx.x / 16);
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
#endif // defined(RDNA4)
|
||||
if constexpr (supported()) {
|
||||
return threadIdx.x % 16;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
@@ -201,7 +208,17 @@ namespace ggml_cuda_mma {
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 16 && J == 16) {
|
||||
return threadIdx.x % 16;
|
||||
// matrix C
|
||||
#if defined(RDNA3)
|
||||
return 2 * l + (threadIdx.x / 16);
|
||||
#else
|
||||
return ne * (threadIdx.x / 16) + l;
|
||||
#endif // defined(RDNA3)
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
// mmq input for RDNA4
|
||||
return ne * (threadIdx.x / 16) + l;
|
||||
} else if constexpr (I == 16 && J == 4) {
|
||||
return ne * (threadIdx.x / 16) + l;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
@@ -293,12 +310,7 @@ namespace ggml_cuda_mma {
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(RDNA3)
|
||||
// RDNA3 has duplicated data as input.
|
||||
static constexpr int ne = I * J / 32 * 2;
|
||||
#else
|
||||
static constexpr int ne = I * J / 32;
|
||||
#endif // defined(RDNA3)
|
||||
half2 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
@@ -317,14 +329,7 @@ namespace ggml_cuda_mma {
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 16 && J == 8) {
|
||||
#if defined(RDNA4)
|
||||
return 4 * (threadIdx.x / 16) + l;
|
||||
#elif defined(RDNA3)
|
||||
return l;
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
#endif // defined(RDNA4)
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
@@ -382,42 +387,19 @@ namespace ggml_cuda_mma {
|
||||
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
|
||||
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(RDNA3)
|
||||
// RDNA3 has duplicated data as input.
|
||||
static constexpr int ne = I * J / 32 * 2;
|
||||
#else
|
||||
static constexpr int ne = I * J / 32;
|
||||
#endif // defined(RDNA3)
|
||||
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 16 && J == 8) return true;
|
||||
return false;
|
||||
return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR>::supported();
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
if constexpr (I == 16 && J == 8) {
|
||||
return threadIdx.x % 16;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
}
|
||||
return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR>::get_i(l);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 16 && J == 8) {
|
||||
#if defined(RDNA4)
|
||||
return 4 * (threadIdx.x / 16) + l;
|
||||
#elif defined(RDNA3)
|
||||
return l;
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
#endif // defined(RDNA4)
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
}
|
||||
return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR>::get_j(l);
|
||||
}
|
||||
#else
|
||||
static constexpr int ne = I * J / WARP_SIZE;
|
||||
@@ -458,6 +440,28 @@ namespace ggml_cuda_mma {
|
||||
#endif // defined(AMD_WMMA_AVAILABLE)
|
||||
};
|
||||
|
||||
template <int I_, int J_, typename T>
|
||||
struct tile<I_, J_, T, DATA_LAYOUT_J_MAJOR> {
|
||||
static constexpr int I = I_;
|
||||
static constexpr int J = J_;
|
||||
static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR;
|
||||
|
||||
static constexpr int ne = tile<I_, J_, T, DATA_LAYOUT_I_MAJOR>::ne;
|
||||
T x[ne] = {0};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
return tile<I_, J_, T, DATA_LAYOUT_I_MAJOR>::supported();
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
return tile<I_, J_, T, DATA_LAYOUT_I_MAJOR>::get_j(l);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
return tile<I_, J_, T, DATA_LAYOUT_I_MAJOR>::get_i(l);
|
||||
}
|
||||
};
|
||||
|
||||
template <int I_, int J_>
|
||||
struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> {
|
||||
static constexpr int I = I_;
|
||||
@@ -524,6 +528,42 @@ namespace ggml_cuda_mma {
|
||||
}
|
||||
};
|
||||
|
||||
template <int I_, int J_, typename T>
|
||||
struct tile<I_, J_, T, DATA_LAYOUT_I_MAJOR_DUAL> {
|
||||
static constexpr int I = I_;
|
||||
static constexpr int J = J_;
|
||||
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL;
|
||||
|
||||
static constexpr int ne = I * J / 32 * 2;
|
||||
|
||||
T x[ne] = {0};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 16 && J == 16) return true;
|
||||
if (I == 16 && J == 8) return true;
|
||||
if (I == 16 && J == 4) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
if constexpr (supported()) {
|
||||
return threadIdx.x % 16;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (supported()) {
|
||||
return l;
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(TURING_MMA_AVAILABLE)
|
||||
template <int I, int J>
|
||||
static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
|
||||
@@ -569,55 +609,28 @@ namespace ggml_cuda_mma {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
} else {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
|
||||
#if defined(RDNA4)
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
#elif defined(RDNA3)
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2));
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(RDNA4)
|
||||
} else if constexpr (std::is_same_v<T, int>) {
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
#if defined(RDNA4)
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
#elif defined(RDNA3)
|
||||
static_assert(tile<I,J,T>::ne >= 4, "fragment too small");
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride);
|
||||
xi[0] = xs[0];
|
||||
xi[1] = xs[1];
|
||||
#endif // defined(RDNA4)
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
#if defined(RDNA4)
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
|
||||
const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
|
||||
xi[1] = xs1[0];
|
||||
#elif defined(RDNA3)
|
||||
static_assert(tile<I,J,T>::ne >= 8, "fragment too small");
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride);
|
||||
// contiguous four 64-bit chunks per lane for the wider RDNA3 fragment
|
||||
xi[0] = xs[0];
|
||||
xi[1] = xs[1];
|
||||
const int64_t * xs1 = xs + 2;
|
||||
xi[2] = xs1[0];
|
||||
xi[3] = xs1[1];
|
||||
#endif // defined(RDNA4)
|
||||
// All wmma layout has contiguous data when i-major.
|
||||
if constexpr (is_i_major(dl)) {
|
||||
// the data must be aligned to 16 bytes when bigger than ggml_cuda_get_max_cpy_bytes()
|
||||
constexpr int aligned_copy_bytes = ggml_cuda_get_max_cpy_bytes();
|
||||
if constexpr (sizeof(t.x) > aligned_copy_bytes) {
|
||||
static_assert(sizeof(t.x) % aligned_copy_bytes == 0, "bad type size");
|
||||
constexpr int aligned_copy_count = sizeof(t.x)/aligned_copy_bytes;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < aligned_copy_count; ++i) {
|
||||
ggml_cuda_memcpy_1<aligned_copy_bytes>(t.x + t.ne/aligned_copy_count*i, xs0 + t.get_i(0) * stride + t.get_j(t.ne/aligned_copy_count*i));
|
||||
}
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
}
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
@@ -660,9 +673,9 @@ namespace ggml_cuda_mma {
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template <typename T, data_layout dl>
|
||||
static __device__ __forceinline__ void load_ldmatrix(
|
||||
tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
tile<16, 8, T, dl> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#if defined(TURING_MMA_AVAILABLE)
|
||||
int * xi = (int * ) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
|
||||
@@ -832,8 +845,9 @@ namespace ggml_cuda_mma {
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template <data_layout dl_ab, data_layout dl_d>
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 8, float> & D, const tile<16, 8, float> & A, const tile<8, 8, float> & B) {
|
||||
tile<16, 8, float, dl_d> & D, const tile<16, 8, float, dl_ab> & A, const tile<8, 8, float, dl_ab> & B) {
|
||||
#ifdef AMPERE_MMA_AVAILABLE
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
@@ -887,8 +901,9 @@ namespace ggml_cuda_mma {
|
||||
#endif // AMPERE_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template <data_layout dl_ab, data_layout dl_d>
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, float> & D, const tile<16, 8, half2> & A, const tile<16, 8, half2> & B) {
|
||||
tile<16, 16, float, dl_d> & D, const tile<16, 8, half2, dl_ab> & A, const tile<16, 8, half2, dl_ab> & B) {
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
@@ -940,8 +955,9 @@ namespace ggml_cuda_mma {
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template <data_layout dl_ab, data_layout dl_d>
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) {
|
||||
tile<16, 16, float, dl_d> & D, const tile<16, 8, nv_bfloat162, dl_ab> & A, const tile<16, 8, nv_bfloat162, dl_ab> & B) {
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(RDNA4)
|
||||
using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
|
||||
@@ -967,8 +983,9 @@ namespace ggml_cuda_mma {
|
||||
#endif // AMPERE_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template <data_layout dl_d, data_layout dl_ab>
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
|
||||
tile<16, 16, int, dl_d> & D, const tile<16, 8, int, dl_ab> & A, const tile<16, 8, int, dl_ab> & B) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * acc = (int32x4_t *) D.x;
|
||||
@@ -1122,8 +1139,9 @@ namespace ggml_cuda_mma {
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, int> & D, const tile<16, 4, int> & A, const tile<16, 4, int> & B) {
|
||||
template <data_layout dl_d, data_layout dl_ab>
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, int, dl_d> & D, const tile<16, 4, int, dl_ab> & A, const tile<16, 4, int, dl_ab> & B) {
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
||||
int32x8_t * acc = (int32x8_t *) D.x;
|
||||
|
||||
+14
-10
@@ -32,11 +32,13 @@ static __global__ void mul_mat_f(
|
||||
#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
// Special case for tf32, just dummy mma layout as wmma doesn't support it.
|
||||
constexpr int tile_B_I = std::is_same_v<T, float> ? 8 : 16;
|
||||
constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16;
|
||||
typedef tile<16, 8, T> tile_A;
|
||||
typedef tile<tile_B_I, 8, T> tile_B;
|
||||
typedef tile<16, tile_C_J, float> tile_C;
|
||||
constexpr bool is_tf32 = std::is_same_v<T, float>;
|
||||
constexpr int tile_B_I = is_tf32 ? 8 : 16;
|
||||
constexpr int tile_C_J = is_tf32 ? 8 : 16;
|
||||
constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : get_input_data_layout();
|
||||
typedef tile<16, 8, T, ab_layout> tile_A;
|
||||
typedef tile<tile_B_I, 8, T, ab_layout> tile_B;
|
||||
typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
#else
|
||||
#ifdef VOLTA_MMA_AVAILABLE
|
||||
if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
|
||||
@@ -272,11 +274,13 @@ static __global__ void mul_mat_f_ids(
|
||||
#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
// Special case for tf32, just dummy mma layout as wmma doesn't support it.
|
||||
constexpr int tile_B_I = std::is_same_v<T, float> ? 8 : 16;
|
||||
constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16;
|
||||
typedef tile<16, 8, T> tile_A;
|
||||
typedef tile<tile_B_I, 8, T> tile_B;
|
||||
typedef tile<16, tile_C_J, float> tile_C;
|
||||
constexpr bool is_tf32 = std::is_same_v<T, float>;
|
||||
constexpr int tile_B_I = is_tf32 ? 8 : 16;
|
||||
constexpr int tile_C_J = is_tf32 ? 8 : 16;
|
||||
constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : get_input_data_layout();
|
||||
typedef tile<16, 8, T, ab_layout> tile_A;
|
||||
typedef tile<tile_B_I, 8, T, ab_layout> tile_B;
|
||||
typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
#else
|
||||
#ifdef VOLTA_MMA_AVAILABLE
|
||||
if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
|
||||
|
||||
+36
-29
@@ -797,9 +797,10 @@ template <int mmq_x, int mmq_y, mmq_q8_1_ds_layout ds_layout>
|
||||
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
typedef tile<16, 8, int> tile_A;
|
||||
typedef tile<16, 8, int> tile_B;
|
||||
typedef tile<16, 16, int> tile_C;
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
@@ -966,9 +967,10 @@ template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
typedef tile<16, 8, int> tile_A;
|
||||
typedef tile<16, 8, int> tile_B;
|
||||
typedef tile<16, 16, int> tile_C;
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
@@ -1130,10 +1132,11 @@ template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
typedef tile<16, 8, int> tile_A;
|
||||
typedef tile<16, 8, int> tile_B;
|
||||
typedef tile<16, 16, int> tile_C;
|
||||
typedef tile<64, 2, int> tile_load;
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
typedef tile<64, 2, int, input_layout> tile_load;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
@@ -1179,9 +1182,10 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
|
||||
typedef tile<16, 4, int> tile_A;
|
||||
typedef tile<16, 4, int> tile_B;
|
||||
typedef tile<16, 16, int> tile_C;
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 4, int, input_layout> tile_A;
|
||||
typedef tile<16, 4, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
@@ -1435,10 +1439,11 @@ template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
typedef tile<16, 8, int> tile_A;
|
||||
typedef tile<16, 8, int> tile_B;
|
||||
typedef tile<16, 16, int> tile_C;
|
||||
typedef tile<64, 2, int> tile_load;
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
typedef tile<64, 2, int, input_layout> tile_load;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
@@ -1501,10 +1506,10 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
|
||||
|
||||
typedef tile<16, 4, int> tile_A;
|
||||
typedef tile<16, 4, int> tile_B;
|
||||
typedef tile<16, 16, int> tile_C;
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 4, int, input_layout> tile_A;
|
||||
typedef tile<16, 4, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
@@ -2265,10 +2270,11 @@ template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
typedef tile<16, 8, int> tile_A;
|
||||
typedef tile<16, 8, int> tile_B;
|
||||
typedef tile<16, 16, int> tile_C;
|
||||
typedef tile<64, 2, int> tile_load;
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
typedef tile<64, 2, int, input_layout> tile_load;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
@@ -2316,9 +2322,10 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
|
||||
typedef tile<16, 4, int> tile_A;
|
||||
typedef tile<16, 4, int> tile_B;
|
||||
typedef tile<16, 16, int> tile_C;
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 4, int, input_layout> tile_A;
|
||||
typedef tile<16, 4, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
@@ -3015,7 +3022,7 @@ static __device__ __forceinline__ void mmq_write_back_mma(
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
constexpr int tileC_IJ = mmq_get_granularity_device(0);
|
||||
typedef tile<tileC_IJ, tileC_IJ, int> tile_C;
|
||||
typedef tile<tileC_IJ, tileC_IJ, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
constexpr int rows_per_warp = granularity;
|
||||
#else
|
||||
typedef tile<16, 8, int> tile_C;
|
||||
|
||||
@@ -2161,8 +2161,14 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
|
||||
}
|
||||
|
||||
// src0, src1 & dst must be mapped to the same session
|
||||
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
||||
return false;
|
||||
if(src1){
|
||||
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
||||
return false;
|
||||
}
|
||||
}else{
|
||||
if (!hex_supported_buffer(sess, src0, dst)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -2662,6 +2668,10 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
|
||||
req.op = HTP_OP_UNARY_SILU;
|
||||
supported = true;
|
||||
}
|
||||
else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){
|
||||
req.op = HTP_OP_UNARY_GELU;
|
||||
supported = true;
|
||||
}
|
||||
break;
|
||||
|
||||
case GGML_OP_GLU:
|
||||
@@ -2677,6 +2687,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
|
||||
case GGML_OP_SOFT_MAX:
|
||||
req.op = HTP_OP_SOFTMAX;
|
||||
supported = true;
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
@@ -2956,6 +2967,8 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
||||
case GGML_OP_UNARY:
|
||||
if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) {
|
||||
ggml_hexagon_unary(node, flags);
|
||||
} else if (ggml_get_unary_op(node) == GGML_UNARY_OP_GELU) {
|
||||
ggml_hexagon_unary(node, flags);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_GLU:
|
||||
@@ -3254,7 +3267,6 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
auto sess = static_cast<ggml_hexagon_session *>(dev->context);
|
||||
|
||||
bool supp = false;
|
||||
|
||||
switch (op->op) {
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
@@ -3294,10 +3306,13 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) {
|
||||
supp = ggml_hexagon_supported_activations(sess, op);
|
||||
}
|
||||
else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU){
|
||||
supp = ggml_hexagon_supported_activations(sess, op);
|
||||
}
|
||||
break;
|
||||
|
||||
case GGML_OP_GLU:
|
||||
if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) /* || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI) */) {
|
||||
if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI) ) {
|
||||
supp = ggml_hexagon_supported_activations(sess, op);
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -231,7 +231,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
|
||||
// x (src0_spad_data) = std::min(src0_p[k], limit);
|
||||
hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
|
||||
// y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
|
||||
hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc);
|
||||
hvx_clamp_scalar_f32((const uint8_t *) src1, -limit, limit, src1_spad_data, nc);
|
||||
// y (src1_spad_data) = y1 + 1.f
|
||||
hvx_add_scalar_f32(src1_spad_data, 1.0, src1_spad_data, nc);
|
||||
// x1 (dst_spad_data) = alpha * (x)
|
||||
@@ -255,6 +255,91 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
|
||||
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
|
||||
static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
|
||||
struct htp_tensor * dst,
|
||||
const int32_t * op_params,
|
||||
struct htp_spad * src0_spad,
|
||||
struct htp_spad * dst_spad,
|
||||
uint32_t nth,
|
||||
uint32_t ith,
|
||||
uint32_t src0_nrows_per_thread) {
|
||||
htp_act_preamble2;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
const size_t src0_row_size = nb01;
|
||||
const size_t dst_row_size = nb1;
|
||||
|
||||
const uint32_t src0_nrows = ne01 * ne02 * ne03;
|
||||
|
||||
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
|
||||
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
|
||||
|
||||
// no work for this thread
|
||||
if (src0_start_row >= src0_end_row) {
|
||||
return;
|
||||
}
|
||||
|
||||
int is_aligned = 1;
|
||||
int opt_path = 0;
|
||||
if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
|
||||
is_aligned = 0;
|
||||
FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
|
||||
}
|
||||
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
|
||||
opt_path = 1;
|
||||
}
|
||||
|
||||
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
|
||||
uint8_t * restrict data_dst = (uint8_t *) dst->data;
|
||||
|
||||
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
|
||||
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size);
|
||||
|
||||
const int BLOCK = 8;
|
||||
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
|
||||
const uint32_t block_end = MIN(ir + BLOCK, src0_end_row);
|
||||
|
||||
// Prefetch next block
|
||||
if (block_end < src0_end_row) {
|
||||
const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size));
|
||||
htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size);
|
||||
}
|
||||
|
||||
// Process rows in current block
|
||||
for (uint32_t ib = ir; ib < block_end; ib++) {
|
||||
const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size));
|
||||
float * restrict dst = (float *) (data_dst + (ib * dst_row_size));
|
||||
|
||||
// gelu = x * sigmoid(1.702 * x) // current implementation
|
||||
if (1 == opt_path) {
|
||||
hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
|
||||
hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
|
||||
hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
|
||||
} else {
|
||||
hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
|
||||
hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
|
||||
hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
|
||||
ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
|
||||
struct htp_ops_context * octx = (struct htp_ops_context *) data;
|
||||
unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
|
||||
octx->src0_nrows_per_thread);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void unary_silu_fp32_per_thread(const struct htp_tensor * src0,
|
||||
struct htp_tensor * dst,
|
||||
const int32_t * op_params,
|
||||
@@ -371,7 +456,10 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
|
||||
act_op_func = glu_swiglu_oai_fp32;
|
||||
op_type = "swiglu-oai-f32";
|
||||
break;
|
||||
|
||||
case HTP_OP_UNARY_GELU:
|
||||
act_op_func = unary_gelu_fp32;
|
||||
op_type = "gelu-f32";
|
||||
break;
|
||||
default:
|
||||
FARF(ERROR, "Unsupported activations Op %u\n", octx->op);
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
|
||||
@@ -51,11 +51,12 @@ enum htp_op {
|
||||
HTP_OP_MUL_MAT_ID = 5,
|
||||
HTP_OP_RMS_NORM = 6,
|
||||
HTP_OP_UNARY_SILU = 7,
|
||||
HTP_OP_GLU_SWIGLU = 8,
|
||||
HTP_OP_GLU_SWIGLU_OAI = 9,
|
||||
HTP_OP_SOFTMAX = 10,
|
||||
HTP_OP_ADD_ID = 11,
|
||||
HTP_OP_ROPE = 12,
|
||||
HTP_OP_UNARY_GELU = 8,
|
||||
HTP_OP_GLU_SWIGLU = 9,
|
||||
HTP_OP_GLU_SWIGLU_OAI = 10,
|
||||
HTP_OP_SOFTMAX = 11,
|
||||
HTP_OP_ADD_ID = 12,
|
||||
HTP_OP_ROPE = 13,
|
||||
INVALID
|
||||
};
|
||||
|
||||
|
||||
@@ -49,6 +49,8 @@ void hvx_mul_f32(const uint8_t * restrict src0,
|
||||
FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n");
|
||||
}
|
||||
|
||||
|
||||
bool handled_leftover = false;
|
||||
if (0 == unaligned_loop) {
|
||||
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
|
||||
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
|
||||
@@ -60,18 +62,59 @@ void hvx_mul_f32(const uint8_t * restrict src0,
|
||||
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
|
||||
}
|
||||
} else {
|
||||
int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
|
||||
int leftover_size = left_over * sizeof(float);
|
||||
|
||||
|
||||
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
|
||||
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
|
||||
HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
|
||||
|
||||
HVX_Vector slinep;
|
||||
HVX_Vector slinec;
|
||||
HVX_Vector sline;
|
||||
HVX_Vector sline2p;
|
||||
HVX_Vector sline2c;
|
||||
HVX_Vector sline2;
|
||||
|
||||
slinep = *vec_in1++;
|
||||
sline2p = *vec_in2++;
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
|
||||
HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
|
||||
for (int i = step_of_1 - 1; i > 0; i--) {
|
||||
slinec = *vec_in1++;
|
||||
sline2c = *vec_in2++;
|
||||
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
|
||||
sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
|
||||
|
||||
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2);
|
||||
*((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
|
||||
slinep = slinec;
|
||||
sline2p = sline2c;
|
||||
}
|
||||
if (step_of_1 > 1) {
|
||||
slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++;
|
||||
sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++;
|
||||
|
||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
|
||||
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
|
||||
sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
|
||||
*((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
|
||||
slinep = slinec;
|
||||
sline2p = sline2c;
|
||||
}
|
||||
if (left_over > 0) {
|
||||
slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++);
|
||||
|
||||
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
|
||||
sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ? sline2p : *vec_in2++);
|
||||
sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
|
||||
|
||||
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2);
|
||||
hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out));
|
||||
handled_leftover = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (left_over > 0) {
|
||||
|
||||
if (left_over > 0 && !handled_leftover) {
|
||||
const float * src0f = (const float *) src0 + num_elems_whole;
|
||||
const float * src1f = (const float *) src1 + num_elems_whole;
|
||||
float * dstf = (float *) dst + num_elems_whole;
|
||||
@@ -464,7 +507,7 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
|
||||
}
|
||||
|
||||
HVX_Vector val_vec = hvx_vec_splat_fp32(val);
|
||||
|
||||
bool handled_leftover = false;
|
||||
if (0 == unaligned_loop) {
|
||||
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
|
||||
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
|
||||
@@ -475,17 +518,47 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
|
||||
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
|
||||
}
|
||||
} else {
|
||||
int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
|
||||
int leftover_size = left_over * sizeof(float);
|
||||
|
||||
HVX_Vector * input_v_ptr = (HVX_Vector *) src;
|
||||
HVX_UVector * output_v_ptr = (HVX_UVector *) dst;
|
||||
|
||||
HVX_Vector slinep;
|
||||
HVX_Vector slinec;
|
||||
HVX_Vector sline;
|
||||
|
||||
slinep = *input_v_ptr++;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
|
||||
for (int i = step_of_1 - 1; i > 0; i--) {
|
||||
slinec = *input_v_ptr++;
|
||||
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
|
||||
*((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
|
||||
/* Prepare slinep for next iteration */
|
||||
slinep = slinec;
|
||||
}
|
||||
|
||||
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec);
|
||||
if (step_of_1 > 0) {
|
||||
slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++;
|
||||
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
|
||||
*((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
|
||||
|
||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
|
||||
slinep = slinec;
|
||||
}
|
||||
|
||||
if (leftover_size > 0) {
|
||||
slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep : *input_v_ptr++);
|
||||
|
||||
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
|
||||
|
||||
HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
|
||||
hvx_vec_store_u(output_v_ptr, leftover_size, sout);
|
||||
handled_leftover = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (left_over > 0) {
|
||||
if (left_over > 0 && !handled_leftover) {
|
||||
const float * srcf = (const float *) src + num_elems_whole;
|
||||
float * dstf = (float *) dst + num_elems_whole;
|
||||
|
||||
@@ -875,35 +948,45 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) {
|
||||
void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
|
||||
size_t left_over = num_elems & (VLEN_FP32 - 1);
|
||||
size_t num_elems_whole = num_elems - left_over;
|
||||
|
||||
int unalign_address = 0;
|
||||
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
|
||||
FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
|
||||
unalign_address = 1;
|
||||
}
|
||||
|
||||
assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));
|
||||
|
||||
const float * src_f = (const float *) src;
|
||||
|
||||
HVX_Vector vec_min = Q6_V_vsplat_R(val);
|
||||
HVX_Vector vec_min = hvx_vec_splat_fp32(val);
|
||||
|
||||
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
|
||||
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
|
||||
if(unalign_address == 0){
|
||||
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
|
||||
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
|
||||
*vec_out++ = Q6_Vsf_equals_Vqf32(vec_min);
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
|
||||
*vec_out++ = (min_clamp);
|
||||
}
|
||||
}else{
|
||||
HVX_UVector * restrict vec_in = (HVX_Vector *) src;
|
||||
HVX_UVector * restrict vec_out = (HVX_Vector *) dst;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
|
||||
*vec_out++ = (min_clamp);
|
||||
}
|
||||
}
|
||||
|
||||
if (left_over > 0) {
|
||||
if (left_over > 0 ) {
|
||||
const float * srcf = (const float *) src + num_elems_whole;
|
||||
float * dstf = (float *) dst + num_elems_whole;
|
||||
|
||||
HVX_Vector in = *(HVX_UVector *) srcf;
|
||||
HVX_UVector in = *(HVX_UVector *) srcf;
|
||||
|
||||
vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, in);
|
||||
HVX_UVector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, in);
|
||||
|
||||
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(vec_min));
|
||||
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, (min_clamp));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -915,46 +998,70 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src,
|
||||
size_t left_over = num_elems & (VLEN_FP32 - 1);
|
||||
size_t num_elems_whole = num_elems - left_over;
|
||||
|
||||
int unalign_address = 0;
|
||||
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
|
||||
FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
|
||||
unalign_address = 1;
|
||||
}
|
||||
|
||||
assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));
|
||||
|
||||
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
|
||||
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
|
||||
|
||||
HVX_Vector range_left = hvx_vec_splat_fp32(limit_left);
|
||||
HVX_Vector range_right = hvx_vec_splat_fp32(limit_right);
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
HVX_Vector in_vec = *vec_in++;
|
||||
HVX_Vector temp_v = in_vec;
|
||||
if(unalign_address == 0){
|
||||
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
|
||||
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
|
||||
|
||||
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
|
||||
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
|
||||
|
||||
in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
|
||||
in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, temp_v);
|
||||
|
||||
*vec_out++ = Q6_Vsf_equals_Vqf32(in_vec);
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
HVX_Vector in_vec = *vec_in++;
|
||||
HVX_Vector temp_v = in_vec;
|
||||
|
||||
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
|
||||
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
|
||||
|
||||
in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
|
||||
in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);
|
||||
|
||||
*vec_out++ = in_vec;
|
||||
}
|
||||
|
||||
}else{
|
||||
|
||||
HVX_UVector * restrict vec_in = (HVX_UVector *) src;
|
||||
HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
HVX_Vector in_vec = *vec_in++;
|
||||
HVX_Vector temp_v = in_vec;
|
||||
|
||||
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
|
||||
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
|
||||
|
||||
in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
|
||||
in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);
|
||||
|
||||
*vec_out++ = in_vec;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (left_over > 0) {
|
||||
const float * srcf = (const float *) src + num_elems_whole;
|
||||
float * dstf = (float *) dst + num_elems_whole;
|
||||
|
||||
HVX_Vector in = *(HVX_UVector *) srcf;
|
||||
HVX_Vector in_vec = *(HVX_UVector *) srcf;
|
||||
|
||||
HVX_Vector temp_v = in;
|
||||
HVX_Vector temp_v = in_vec;
|
||||
|
||||
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in, range_right);
|
||||
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in);
|
||||
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
|
||||
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
|
||||
|
||||
in = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
|
||||
in = Q6_V_vmux_QVV(pred_cap_left, range_left, temp_v);
|
||||
in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
|
||||
in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);
|
||||
|
||||
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(in));
|
||||
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, in_vec);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -265,12 +265,16 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */
|
||||
static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
|
||||
uint32_t left_off = (size_t) addr & (chunk_size - 1);
|
||||
uint32_t right_off = left_off + n;
|
||||
return right_off <= chunk_size;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
|
||||
HVX_VectorAlias u = { .v = v };
|
||||
|
||||
@@ -994,6 +998,59 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){
|
||||
int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
|
||||
int leftover = num_elems - (step_of_1 * VLEN_FP32);
|
||||
|
||||
int32_t leftover_size = leftover * sizeof(float);
|
||||
|
||||
static const float kMinExp = -87.f; // 0
|
||||
static const float kMaxExp = 87.f; // 1
|
||||
|
||||
const HVX_Vector one = hvx_vec_splat_fp32(1.f);
|
||||
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
|
||||
const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
|
||||
|
||||
const float *input = (float *)src;
|
||||
float *output = (float *)dst;
|
||||
|
||||
HVX_Vector * input_v_ptr = (HVX_Vector *) input;
|
||||
HVX_UVector * output_v_ptr = (HVX_UVector *) output;
|
||||
|
||||
HVX_Vector slinep;
|
||||
HVX_Vector slinec;
|
||||
HVX_Vector sline;
|
||||
|
||||
slinep = *input_v_ptr++;
|
||||
#pragma unroll(4)
|
||||
for (int i = step_of_1 - 1; i > 0; i--) {
|
||||
slinec = *input_v_ptr++;
|
||||
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
|
||||
*((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
|
||||
/* Prepare slinep for next iteration */
|
||||
slinep = slinec;
|
||||
}
|
||||
|
||||
if (step_of_1 > 0) {
|
||||
slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++;
|
||||
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
|
||||
*((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
|
||||
;
|
||||
|
||||
slinep = slinec;
|
||||
}
|
||||
if (leftover > 0) {
|
||||
slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) ? slinep : *input_v_ptr++);
|
||||
|
||||
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
|
||||
|
||||
HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
|
||||
hvx_vec_store_u(output_v_ptr, leftover_size, sout);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
|
||||
void hvx_mul_f32(const uint8_t * restrict src0,
|
||||
const uint8_t * restrict src1,
|
||||
|
||||
@@ -798,6 +798,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
break;
|
||||
|
||||
case HTP_OP_UNARY_SILU:
|
||||
case HTP_OP_UNARY_GELU:
|
||||
if (n_bufs != 2) {
|
||||
FARF(ERROR, "Bad act-req buffer list");
|
||||
continue;
|
||||
@@ -806,6 +807,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
break;
|
||||
|
||||
case HTP_OP_GLU_SWIGLU:
|
||||
case HTP_OP_GLU_SWIGLU_OAI:
|
||||
case HTP_OP_SOFTMAX:
|
||||
if ((n_bufs != 2) && (n_bufs != 3)) {
|
||||
FARF(ERROR, "Bad act-req buffer list");
|
||||
|
||||
+1
-1
@@ -2055,7 +2055,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
||||
LLM_TENSOR_SHORTCONV_INPROJ,
|
||||
LLM_TENSOR_SHORTCONV_OUTPROJ,
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_OUTPUT_NORM_LFM2,
|
||||
LLM_TENSOR_FFN_GATE_INP,
|
||||
LLM_TENSOR_FFN_GATE_EXPS,
|
||||
LLM_TENSOR_FFN_DOWN_EXPS,
|
||||
|
||||
@@ -362,23 +362,39 @@ const char * llama_sampler_name(const struct llama_sampler * smpl) {
|
||||
}
|
||||
|
||||
void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
|
||||
if (!smpl) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (smpl->iface->accept) {
|
||||
smpl->iface->accept(smpl, token);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sampler_apply(struct llama_sampler * smpl, struct llama_token_data_array * cur_p) {
|
||||
if (!smpl) {
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(smpl->iface->apply);
|
||||
smpl->iface->apply(smpl, cur_p);
|
||||
}
|
||||
|
||||
void llama_sampler_reset(struct llama_sampler * smpl) {
|
||||
if (!smpl) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (smpl->iface->reset) {
|
||||
smpl->iface->reset(smpl);
|
||||
}
|
||||
}
|
||||
|
||||
struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
|
||||
if (!smpl) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (smpl->iface->clone) {
|
||||
return smpl->iface->clone(smpl);
|
||||
}
|
||||
|
||||
@@ -542,6 +542,7 @@ static void llama_params_fit_impl(
|
||||
} else {
|
||||
assert(ngl_per_device_high[id].n_layer == n_unassigned);
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
mem = mem_high;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
}
|
||||
}
|
||||
@@ -629,6 +630,7 @@ static void llama_params_fit_impl(
|
||||
}
|
||||
} else {
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
mem = mem_high;
|
||||
id_dense_start = id_dense_start_high;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
|
||||
@@ -329,6 +329,7 @@ struct mtmd_context {
|
||||
case PROJECTOR_TYPE_QWEN25O:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
break;
|
||||
default:
|
||||
|
||||
+12
-7
@@ -46,7 +46,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
|
||||
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
||||
| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
||||
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||
@@ -82,13 +82,16 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
|
||||
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
|
||||
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
|
||||
| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
|
||||
| `-fitt, --fit-target MiB` | target margin per device for --fit option, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
|
||||
| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096<br/>(env: LLAMA_ARG_FIT_CTX) |
|
||||
| `--check-tensors` | check model tensor data for invalid values (default: false) |
|
||||
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
|
||||
| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
|
||||
| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
|
||||
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
|
||||
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
|
||||
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
|
||||
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
|
||||
| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) |
|
||||
| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)<br/>note: use comma-separated values |
|
||||
| `--control-vector FNAME` | add a control vector<br/>note: use comma-separated values to add multiple control vectors |
|
||||
| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE<br/>note: use comma-separated values (format: FNAME:SCALE,...) |
|
||||
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
|
||||
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
|
||||
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
|
||||
@@ -120,7 +123,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||
| `--sampling-seq, --sampler-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
|
||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||
| `--temp N` | temperature (default: 0.8) |
|
||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
|
||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
|
||||
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
|
||||
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
|
||||
| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
|
||||
@@ -177,6 +180,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
|
||||
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
|
||||
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
|
||||
| `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG) |
|
||||
| `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG_FILE) |
|
||||
| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
|
||||
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
|
||||
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
|
||||
|
||||
Binary file not shown.
@@ -544,6 +544,8 @@ struct server_context_impl {
|
||||
|
||||
server_metrics metrics;
|
||||
|
||||
json webui_settings = json::object();
|
||||
|
||||
// Necessary similarity of prompt for slot selection
|
||||
float slot_prompt_similarity = 0.0f;
|
||||
|
||||
@@ -575,6 +577,16 @@ struct server_context_impl {
|
||||
|
||||
params_base = params;
|
||||
|
||||
webui_settings = json::object();
|
||||
if (!params_base.webui_config_json.empty()) {
|
||||
try {
|
||||
webui_settings = json::parse(params_base.webui_config_json);
|
||||
} catch (const std::exception & e) {
|
||||
SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
llama_init = common_init_from_params(params_base);
|
||||
|
||||
model = llama_init->model();
|
||||
@@ -3103,7 +3115,6 @@ void server_routes::init_routes() {
|
||||
};
|
||||
}
|
||||
|
||||
// this endpoint is publicly available, please only return what is safe to be exposed
|
||||
json data = {
|
||||
{ "default_generation_settings", default_generation_settings_for_props },
|
||||
{ "total_slots", ctx_server.params_base.n_parallel },
|
||||
@@ -3117,6 +3128,7 @@ void server_routes::init_routes() {
|
||||
{ "endpoint_props", params.endpoint_props },
|
||||
{ "endpoint_metrics", params.endpoint_metrics },
|
||||
{ "webui", params.webui },
|
||||
{ "webui_settings", ctx_server.webui_settings },
|
||||
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
|
||||
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
|
||||
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include <chrono>
|
||||
#include <queue>
|
||||
#include <filesystem>
|
||||
#include <cstring>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <winsock2.h>
|
||||
@@ -33,7 +34,8 @@
|
||||
#include <limits.h>
|
||||
#endif
|
||||
|
||||
#define CMD_EXIT "exit"
|
||||
#define CMD_ROUTER_TO_CHILD_EXIT "cmd_router_to_child:exit"
|
||||
#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready"
|
||||
|
||||
// address for child process, this is needed because router may run on 0.0.0.0
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/17862
|
||||
@@ -166,7 +168,9 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para
|
||||
env == "LLAMA_ARG_MODEL" ||
|
||||
env == "LLAMA_ARG_MMPROJ" ||
|
||||
env == "LLAMA_ARG_HF_REPO" ||
|
||||
env == "LLAMA_ARG_NO_MODELS_AUTOLOAD") {
|
||||
env == "LLAMA_ARG_NO_MODELS_AUTOLOAD" ||
|
||||
env == "LLAMA_ARG_SSL_KEY_FILE" ||
|
||||
env == "LLAMA_ARG_SSL_CERT_FILE") {
|
||||
control_args[env] = opt;
|
||||
}
|
||||
}
|
||||
@@ -220,6 +224,9 @@ void server_presets::render_args(server_model_meta & meta) {
|
||||
preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj;
|
||||
}
|
||||
}
|
||||
// disable SSL for child processes (HTTPS already handled by router)
|
||||
preset.options[control_args["LLAMA_ARG_SSL_KEY_FILE"]] = "";
|
||||
preset.options[control_args["LLAMA_ARG_SSL_CERT_FILE"]] = "";
|
||||
meta.args = preset.to_args();
|
||||
// add back the binary path at the front
|
||||
meta.args.insert(meta.args.begin(), get_server_exec_path().string());
|
||||
@@ -534,6 +541,8 @@ void server_models::load(const std::string & name) {
|
||||
std::vector<char *> argv = to_char_ptr_array(child_args);
|
||||
std::vector<char *> envp = to_char_ptr_array(child_env);
|
||||
|
||||
// TODO @ngxson : maybe separate stdout and stderr in the future
|
||||
// so that we can use stdout for commands and stderr for logging
|
||||
int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr;
|
||||
int result = subprocess_create_ex(argv.data(), options, envp.data(), inst.subproc.get());
|
||||
if (result != 0) {
|
||||
@@ -547,11 +556,17 @@ void server_models::load(const std::string & name) {
|
||||
// captured variables are guaranteed to be destroyed only after the thread is joined
|
||||
inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port]() {
|
||||
// read stdout/stderr and forward to main server log
|
||||
bool state_received = false; // true if child state received
|
||||
FILE * p_stdout_stderr = subprocess_stdout(child_proc.get());
|
||||
if (p_stdout_stderr) {
|
||||
char buffer[4096];
|
||||
while (fgets(buffer, sizeof(buffer), p_stdout_stderr) != nullptr) {
|
||||
LOG("[%5d] %s", port, buffer);
|
||||
if (!state_received && std::strstr(buffer, CMD_CHILD_TO_ROUTER_READY) != nullptr) {
|
||||
// child process is ready
|
||||
this->update_status(name, SERVER_MODEL_STATUS_LOADED);
|
||||
state_received = true;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
SRV_ERR("failed to get stdout/stderr of child process for name=%s\n", name.c_str());
|
||||
@@ -595,7 +610,7 @@ static void interrupt_subprocess(FILE * stdin_file) {
|
||||
// because subprocess.h does not provide a way to send SIGINT,
|
||||
// we will send a command to the child process to exit gracefully
|
||||
if (stdin_file) {
|
||||
fprintf(stdin_file, "%s\n", CMD_EXIT);
|
||||
fprintf(stdin_file, "%s\n", CMD_ROUTER_TO_CHILD_EXIT);
|
||||
fflush(stdin_file);
|
||||
}
|
||||
}
|
||||
@@ -707,32 +722,13 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
|
||||
return proxy;
|
||||
}
|
||||
|
||||
std::thread server_models::setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function<void(int)> & shutdown_handler) {
|
||||
std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler) {
|
||||
// send a notification to the router server that a model instance is ready
|
||||
// TODO @ngxson : use HTTP client from libcommon
|
||||
httplib::Client cli(base_params.hostname, router_port);
|
||||
cli.set_connection_timeout(0, 200000); // 200 milliseconds
|
||||
|
||||
httplib::Request req;
|
||||
req.method = "POST";
|
||||
req.path = "/models/status";
|
||||
req.set_header("Content-Type", "application/json");
|
||||
if (!base_params.api_keys.empty()) {
|
||||
req.set_header("Authorization", "Bearer " + base_params.api_keys[0]);
|
||||
}
|
||||
|
||||
json body;
|
||||
body["model"] = name;
|
||||
body["value"] = server_model_status_to_string(SERVER_MODEL_STATUS_LOADED);
|
||||
req.body = body.dump();
|
||||
|
||||
SRV_INF("notifying router server (port=%d) that model %s is ready\n", router_port, name.c_str());
|
||||
auto result = cli.send(std::move(req));
|
||||
if (result.error() != httplib::Error::Success) {
|
||||
auto err_str = httplib::to_string(result.error());
|
||||
SRV_ERR("failed to notify router server: %s\n", err_str.c_str());
|
||||
exit(1); // force exit
|
||||
}
|
||||
common_log_pause(common_log_main());
|
||||
fflush(stdout);
|
||||
fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
|
||||
fflush(stdout);
|
||||
common_log_resume(common_log_main());
|
||||
|
||||
// setup thread for monitoring stdin
|
||||
return std::thread([shutdown_handler]() {
|
||||
@@ -746,7 +742,7 @@ std::thread server_models::setup_child_server(const common_params & base_params,
|
||||
eof = true;
|
||||
break;
|
||||
}
|
||||
if (line.find(CMD_EXIT) != std::string::npos) {
|
||||
if (line.find(CMD_ROUTER_TO_CHILD_EXIT) != std::string::npos) {
|
||||
SRV_INF("%s", "exit command received, exiting...\n");
|
||||
shutdown_handler(0);
|
||||
break;
|
||||
@@ -822,6 +818,7 @@ void server_models_routes::init_routes() {
|
||||
{"params", json{}},
|
||||
{"n_ctx", 0},
|
||||
}},
|
||||
{"webui_settings", webui_settings},
|
||||
});
|
||||
return res;
|
||||
}
|
||||
@@ -869,18 +866,6 @@ void server_models_routes::init_routes() {
|
||||
return res;
|
||||
};
|
||||
|
||||
// used by child process to notify the router about status change
|
||||
// TODO @ngxson : maybe implement authentication for this endpoint in the future
|
||||
this->post_router_models_status = [this](const server_http_req & req) {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
json body = json::parse(req.body);
|
||||
std::string model = json_value(body, "model", std::string());
|
||||
std::string value = json_value(body, "value", std::string());
|
||||
models.update_status(model, server_model_status_from_string(value));
|
||||
res_ok(res, {{"success", true}});
|
||||
return res;
|
||||
};
|
||||
|
||||
this->get_router_models = [this](const server_http_req &) {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
json models_json = json::array();
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "preset.h"
|
||||
#include "server-common.h"
|
||||
#include "server-http.h"
|
||||
|
||||
#include <mutex>
|
||||
@@ -144,14 +145,23 @@ public:
|
||||
|
||||
// notify the router server that a model instance is ready
|
||||
// return the monitoring thread (to be joined by the caller)
|
||||
static std::thread setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function<void(int)> & shutdown_handler);
|
||||
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler);
|
||||
};
|
||||
|
||||
struct server_models_routes {
|
||||
common_params params;
|
||||
json webui_settings = json::object();
|
||||
server_models models;
|
||||
server_models_routes(const common_params & params, int argc, char ** argv, char ** envp)
|
||||
: params(params), models(params, argc, argv, envp) {
|
||||
if (!this->params.webui_config_json.empty()) {
|
||||
try {
|
||||
webui_settings = json::parse(this->params.webui_config_json);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: failed to parse webui config: %s\n", __func__, e.what());
|
||||
throw;
|
||||
}
|
||||
}
|
||||
init_routes();
|
||||
}
|
||||
|
||||
@@ -162,7 +172,6 @@ struct server_models_routes {
|
||||
server_http_context::handler_t proxy_post;
|
||||
server_http_context::handler_t get_router_models;
|
||||
server_http_context::handler_t post_router_models_load;
|
||||
server_http_context::handler_t post_router_models_status;
|
||||
server_http_context::handler_t post_router_models_unload;
|
||||
};
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include "log.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <exception>
|
||||
#include <signal.h>
|
||||
#include <thread> // for std::thread::hardware_concurrency
|
||||
|
||||
@@ -124,7 +125,12 @@ int main(int argc, char ** argv, char ** envp) {
|
||||
std::optional<server_models_routes> models_routes{};
|
||||
if (is_router_server) {
|
||||
// setup server instances manager
|
||||
models_routes.emplace(params, argc, argv, envp);
|
||||
try {
|
||||
models_routes.emplace(params, argc, argv, envp);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what());
|
||||
return 1;
|
||||
}
|
||||
|
||||
// proxy handlers
|
||||
// note: routes.get_health stays the same
|
||||
@@ -153,7 +159,6 @@ int main(int argc, char ** argv, char ** envp) {
|
||||
routes.get_models = models_routes->get_router_models;
|
||||
ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load));
|
||||
ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
|
||||
ctx_http.post("/models/status", ex_wrapper(models_routes->post_router_models_status));
|
||||
}
|
||||
|
||||
ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
|
||||
@@ -291,7 +296,7 @@ int main(int argc, char ** argv, char ** envp) {
|
||||
const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT");
|
||||
std::thread monitor_thread;
|
||||
if (router_port != nullptr) {
|
||||
monitor_thread = server_models::setup_child_server(params, std::atoi(router_port), params.model_alias, shutdown_handler);
|
||||
monitor_thread = server_models::setup_child_server(shutdown_handler);
|
||||
}
|
||||
|
||||
// this call blocks the main thread until queue_tasks.terminate() is called
|
||||
|
||||
@@ -2,6 +2,9 @@
|
||||
import { Settings } from '@lucide/svelte';
|
||||
import { DialogChatSettings } from '$lib/components/app';
|
||||
import { Button } from '$lib/components/ui/button';
|
||||
import { useSidebar } from '$lib/components/ui/sidebar';
|
||||
|
||||
const sidebar = useSidebar();
|
||||
|
||||
let settingsOpen = $state(false);
|
||||
|
||||
@@ -11,7 +14,9 @@
|
||||
</script>
|
||||
|
||||
<header
|
||||
class="md:background-transparent pointer-events-none fixed top-0 right-0 left-0 z-50 flex items-center justify-end bg-background/40 p-4 backdrop-blur-xl md:left-[var(--sidebar-width)]"
|
||||
class="md:background-transparent pointer-events-none fixed top-0 right-0 left-0 z-50 flex items-center justify-end bg-background/40 p-4 backdrop-blur-xl duration-200 ease-linear {sidebar.open
|
||||
? 'md:left-[var(--sidebar-width)]'
|
||||
: ''}"
|
||||
>
|
||||
<div class="pointer-events-auto flex items-center space-x-2">
|
||||
<Button variant="ghost" size="sm" onclick={toggleSettings}>
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
<script lang="ts">
|
||||
import { base } from '$app/paths';
|
||||
import { AlertTriangle, RefreshCw, Key, CheckCircle, XCircle } from '@lucide/svelte';
|
||||
import { goto } from '$app/navigation';
|
||||
import { Button } from '$lib/components/ui/button';
|
||||
@@ -64,7 +65,7 @@
|
||||
settingsStore.updateConfig('apiKey', apiKeyInput.trim());
|
||||
|
||||
// Test the API key by making a real request to the server
|
||||
const response = await fetch('./props', {
|
||||
const response = await fetch(`${base}/props`, {
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
Authorization: `Bearer ${apiKeyInput.trim()}`
|
||||
|
||||
@@ -130,5 +130,19 @@ describe('ParameterSyncService', () => {
|
||||
expect(result.max_tokens).toBe(-1);
|
||||
expect(result.temperature).toBe(0.7);
|
||||
});
|
||||
|
||||
it('should merge webui settings from props when provided', () => {
|
||||
const result = ParameterSyncService.extractServerDefaults(null, {
|
||||
pasteLongTextToFileLen: 0,
|
||||
pdfAsImage: true,
|
||||
renderUserContentAsMarkdown: false,
|
||||
theme: 'dark'
|
||||
});
|
||||
|
||||
expect(result.pasteLongTextToFileLen).toBe(0);
|
||||
expect(result.pdfAsImage).toBe(true);
|
||||
expect(result.renderUserContentAsMarkdown).toBe(false);
|
||||
expect(result.theme).toBeUndefined();
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -55,7 +55,55 @@ export const SYNCABLE_PARAMETERS: SyncableParameter[] = [
|
||||
{ key: 'dry_allowed_length', serverKey: 'dry_allowed_length', type: 'number', canSync: true },
|
||||
{ key: 'dry_penalty_last_n', serverKey: 'dry_penalty_last_n', type: 'number', canSync: true },
|
||||
{ key: 'max_tokens', serverKey: 'max_tokens', type: 'number', canSync: true },
|
||||
{ key: 'samplers', serverKey: 'samplers', type: 'string', canSync: true }
|
||||
{ key: 'samplers', serverKey: 'samplers', type: 'string', canSync: true },
|
||||
{
|
||||
key: 'pasteLongTextToFileLen',
|
||||
serverKey: 'pasteLongTextToFileLen',
|
||||
type: 'number',
|
||||
canSync: true
|
||||
},
|
||||
{ key: 'pdfAsImage', serverKey: 'pdfAsImage', type: 'boolean', canSync: true },
|
||||
{
|
||||
key: 'showThoughtInProgress',
|
||||
serverKey: 'showThoughtInProgress',
|
||||
type: 'boolean',
|
||||
canSync: true
|
||||
},
|
||||
{ key: 'showToolCalls', serverKey: 'showToolCalls', type: 'boolean', canSync: true },
|
||||
{
|
||||
key: 'disableReasoningFormat',
|
||||
serverKey: 'disableReasoningFormat',
|
||||
type: 'boolean',
|
||||
canSync: true
|
||||
},
|
||||
{ key: 'keepStatsVisible', serverKey: 'keepStatsVisible', type: 'boolean', canSync: true },
|
||||
{ key: 'showMessageStats', serverKey: 'showMessageStats', type: 'boolean', canSync: true },
|
||||
{
|
||||
key: 'askForTitleConfirmation',
|
||||
serverKey: 'askForTitleConfirmation',
|
||||
type: 'boolean',
|
||||
canSync: true
|
||||
},
|
||||
{ key: 'disableAutoScroll', serverKey: 'disableAutoScroll', type: 'boolean', canSync: true },
|
||||
{
|
||||
key: 'renderUserContentAsMarkdown',
|
||||
serverKey: 'renderUserContentAsMarkdown',
|
||||
type: 'boolean',
|
||||
canSync: true
|
||||
},
|
||||
{ key: 'autoMicOnEmpty', serverKey: 'autoMicOnEmpty', type: 'boolean', canSync: true },
|
||||
{
|
||||
key: 'pyInterpreterEnabled',
|
||||
serverKey: 'pyInterpreterEnabled',
|
||||
type: 'boolean',
|
||||
canSync: true
|
||||
},
|
||||
{
|
||||
key: 'enableContinueGeneration',
|
||||
serverKey: 'enableContinueGeneration',
|
||||
type: 'boolean',
|
||||
canSync: true
|
||||
}
|
||||
];
|
||||
|
||||
export class ParameterSyncService {
|
||||
@@ -74,25 +122,39 @@ export class ParameterSyncService {
|
||||
* Extract server default parameters that can be synced
|
||||
*/
|
||||
static extractServerDefaults(
|
||||
serverParams: ApiLlamaCppServerProps['default_generation_settings']['params'] | null
|
||||
serverParams: ApiLlamaCppServerProps['default_generation_settings']['params'] | null,
|
||||
webuiSettings?: Record<string, string | number | boolean>
|
||||
): ParameterRecord {
|
||||
if (!serverParams) return {};
|
||||
|
||||
const extracted: ParameterRecord = {};
|
||||
|
||||
for (const param of SYNCABLE_PARAMETERS) {
|
||||
if (param.canSync && param.serverKey in serverParams) {
|
||||
const value = (serverParams as unknown as Record<string, ParameterValue>)[param.serverKey];
|
||||
if (value !== undefined) {
|
||||
// Apply precision rounding to avoid JavaScript floating-point issues
|
||||
extracted[param.key] = this.roundFloatingPoint(value);
|
||||
if (serverParams) {
|
||||
for (const param of SYNCABLE_PARAMETERS) {
|
||||
if (param.canSync && param.serverKey in serverParams) {
|
||||
const value = (serverParams as unknown as Record<string, ParameterValue>)[
|
||||
param.serverKey
|
||||
];
|
||||
if (value !== undefined) {
|
||||
// Apply precision rounding to avoid JavaScript floating-point issues
|
||||
extracted[param.key] = this.roundFloatingPoint(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Handle samplers array conversion to string
|
||||
if (serverParams.samplers && Array.isArray(serverParams.samplers)) {
|
||||
extracted.samplers = serverParams.samplers.join(';');
|
||||
}
|
||||
}
|
||||
|
||||
// Handle samplers array conversion to string
|
||||
if (serverParams.samplers && Array.isArray(serverParams.samplers)) {
|
||||
extracted.samplers = serverParams.samplers.join(';');
|
||||
if (webuiSettings) {
|
||||
for (const param of SYNCABLE_PARAMETERS) {
|
||||
if (param.canSync && param.serverKey in webuiSettings) {
|
||||
const value = webuiSettings[param.serverKey];
|
||||
if (value !== undefined) {
|
||||
extracted[param.key] = this.roundFloatingPoint(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return extracted;
|
||||
|
||||
@@ -40,6 +40,10 @@ class ServerStore {
|
||||
return this.props?.default_generation_settings?.n_ctx ?? null;
|
||||
}
|
||||
|
||||
get webuiSettings(): Record<string, string | number | boolean> | undefined {
|
||||
return this.props?.webui_settings;
|
||||
}
|
||||
|
||||
get isRouterMode(): boolean {
|
||||
return this.role === ServerRole.ROUTER;
|
||||
}
|
||||
|
||||
@@ -66,7 +66,8 @@ class SettingsStore {
|
||||
*/
|
||||
private getServerDefaults(): Record<string, string | number | boolean> {
|
||||
const serverParams = serverStore.defaultParams;
|
||||
return serverParams ? ParameterSyncService.extractServerDefaults(serverParams) : {};
|
||||
const webuiSettings = serverStore.webuiSettings;
|
||||
return ParameterSyncService.extractServerDefaults(serverParams, webuiSettings);
|
||||
}
|
||||
|
||||
constructor() {
|
||||
|
||||
+1
@@ -176,6 +176,7 @@ export interface ApiLlamaCppServerProps {
|
||||
bos_token: string;
|
||||
eos_token: string;
|
||||
build_info: string;
|
||||
webui_settings?: Record<string, string | number | boolean>;
|
||||
}
|
||||
|
||||
export interface ApiChatCompletionRequest {
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { base } from '$app/paths';
|
||||
import { error } from '@sveltejs/kit';
|
||||
import { browser } from '$app/environment';
|
||||
import { config } from '$lib/stores/settings.svelte';
|
||||
@@ -22,7 +23,7 @@ export async function validateApiKey(fetch: typeof globalThis.fetch): Promise<vo
|
||||
headers.Authorization = `Bearer ${apiKey}`;
|
||||
}
|
||||
|
||||
const response = await fetch(`./props`, { headers });
|
||||
const response = await fetch(`${base}/props`, { headers });
|
||||
|
||||
if (!response.ok) {
|
||||
if (response.status === 401 || response.status === 403) {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
<script lang="ts">
|
||||
import '../app.css';
|
||||
import { base } from '$app/paths';
|
||||
import { page } from '$app/state';
|
||||
import { untrack } from 'svelte';
|
||||
import { ChatSidebar, DialogConversationTitleUpdate } from '$lib/components/app';
|
||||
@@ -157,7 +158,7 @@
|
||||
headers.Authorization = `Bearer ${apiKey.trim()}`;
|
||||
}
|
||||
|
||||
fetch(`./props`, { headers })
|
||||
fetch(`${base}/props`, { headers })
|
||||
.then((response) => {
|
||||
if (response.status === 401 || response.status === 403) {
|
||||
window.location.reload();
|
||||
|
||||
Reference in New Issue
Block a user