mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-29 17:17:40 +02:00
Compare commits
36 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 06938ac129 | |||
| d545a2a993 | |||
| 4da6370d43 | |||
| e3666269f9 | |||
| 63e66fdd23 | |||
| 5c394fdc8b | |||
| 4fb16eccce | |||
| bfb4308b05 | |||
| 2187e00337 | |||
| 0b7154066e | |||
| 60130d18f9 | |||
| a468b89018 | |||
| d5ab0834ab | |||
| 69cea5b669 | |||
| f8e67fc583 | |||
| 2365315955 | |||
| f7a0777a5c | |||
| 4f3a4beb8d | |||
| 8f7f3bf141 | |||
| d178a11818 | |||
| 354ebac8cb | |||
| 1fd5f48037 | |||
| 210a6570ce | |||
| b8275a8acc | |||
| 5dcb711666 | |||
| 5aa3a64596 | |||
| 27d9ed8397 | |||
| 335abed17d | |||
| de6f727aae | |||
| 95b8b8ec1a | |||
| 55ac0909e5 | |||
| bef69f1306 | |||
| 5aba5364d9 | |||
| 8e6fff84de | |||
| 02a57017f6 | |||
| 48b88c3b00 |
+28
-1
@@ -3,6 +3,7 @@
|
||||
glibc,
|
||||
config,
|
||||
stdenv,
|
||||
stdenvNoCC,
|
||||
runCommand,
|
||||
cmake,
|
||||
ninja,
|
||||
@@ -19,6 +20,8 @@
|
||||
openssl,
|
||||
shaderc,
|
||||
spirv-headers,
|
||||
nodejs,
|
||||
importNpmLock,
|
||||
useBlas ?
|
||||
builtins.all (x: !x) [
|
||||
useCuda
|
||||
@@ -130,7 +133,31 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
src = lib.cleanSource ../../.;
|
||||
};
|
||||
|
||||
postPatch = ''
|
||||
# Builds the webui locally, taking care not to require updating any sha256 hash.
|
||||
webui = stdenvNoCC.mkDerivation {
|
||||
pname = "webui";
|
||||
version = llamaVersion;
|
||||
src = lib.cleanSource ../../tools/ui;
|
||||
|
||||
nativeBuildInputs = [
|
||||
nodejs
|
||||
importNpmLock.linkNodeModulesHook
|
||||
];
|
||||
|
||||
# no sha256 required when using buildNodeModules
|
||||
npmDeps = importNpmLock.buildNodeModules {
|
||||
npmRoot = ../../tools/ui;
|
||||
inherit nodejs;
|
||||
};
|
||||
|
||||
installPhase = ''
|
||||
LLAMA_UI_OUT_DIR=$out npm run build --offline
|
||||
'';
|
||||
};
|
||||
|
||||
postPatch = lib.optionalString useWebUi ''
|
||||
cp -r ${finalAttrs.webui} tools/ui/dist
|
||||
chmod -R u+w tools/ui/dist
|
||||
'';
|
||||
|
||||
# With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
|
||||
|
||||
@@ -619,10 +619,11 @@ jobs:
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
|
||||
# TODO: these jobs need to use llvm toolchain in order to utilize the ccache
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
|
||||
|
||||
- name: Install OpenCL Headers and Libs
|
||||
id: install_opencl
|
||||
@@ -650,10 +651,10 @@ jobs:
|
||||
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --config Release --target ${{ matrix.target }}
|
||||
|
||||
- name: ccache-clear
|
||||
uses: ./.github/actions/ccache-clear
|
||||
with:
|
||||
key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
|
||||
#- name: ccache-clear
|
||||
# uses: ./.github/actions/ccache-clear
|
||||
# with:
|
||||
# key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
|
||||
@@ -42,23 +42,6 @@ jobs:
|
||||
server-metal:
|
||||
runs-on: [self-hosted, llama-server, macOS, ARM64]
|
||||
|
||||
name: server-metal (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
- build_type: Release
|
||||
extra_args: "GGML_METAL_DEVICES=2"
|
||||
wf_name: "GPUx2"
|
||||
- build_type: Release
|
||||
extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx2, backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
@@ -67,44 +50,58 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
|
||||
- name: Tests (GPUx1)
|
||||
id: server_integration_tests
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Tests (GPUx1, backend-sampling)
|
||||
id: server_integration_tests_backend_sampling
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
export LLAMA_ARG_BACKEND_SAMPLING=1
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Tests (GPUx2)
|
||||
id: server_integration_tests_gpu2
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
export GGML_METAL_DEVICES=2
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Tests (GPUx2, backend-sampling)
|
||||
id: server_integration_tests_gpu2_backend_sampling
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
export GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-cuda:
|
||||
runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
|
||||
name: server-cuda (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
@@ -117,32 +114,36 @@ jobs:
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
cmake --build build --config Release -j $(nproc) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
|
||||
- name: Tests (GPUx1)
|
||||
id: server_integration_tests
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Tests (GPUx1, backend-sampling)
|
||||
id: server_integration_tests_backend_sampling
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
export LLAMA_ARG_BACKEND_SAMPLING=1
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-kleidiai:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
name: server-kleidiai (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_build_flags: "-DGGML_CPU_KLEIDIAI=ON"
|
||||
extra_args: ""
|
||||
wf_name: "CPUx1, kleidiai"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
@@ -181,16 +182,21 @@ jobs:
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON ${{ matrix.extra_build_flags }}
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON -DGGML_CPU_KLEIDIAI=ON
|
||||
cmake --build build --config Release -j $(nproc) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
@@ -102,7 +102,6 @@ jobs:
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
pytest -v -x -m "not slow"
|
||||
@@ -116,7 +115,6 @@ jobs:
|
||||
|
||||
- name: Tests (Backend sampling)
|
||||
id: server_integration_tests_backend_sampling
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
export LLAMA_ARG_BACKEND_SAMPLING=1
|
||||
@@ -169,7 +167,6 @@ jobs:
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
$env:PYTHONIOENCODING = ":replace"
|
||||
|
||||
@@ -143,6 +143,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
|
||||
- [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
|
||||
- [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)
|
||||
- [x] [Mellum models](https://huggingface.co/JetBrains/models?search=mellum)
|
||||
|
||||
#### Multimodal
|
||||
|
||||
|
||||
+5
-5
@@ -12,16 +12,16 @@
|
||||
|
||||
## Reporting a vulnerability
|
||||
|
||||
> [!IMPORTANT]
|
||||
> The private security disclosure program is disabled until further notice. Please submit patches with fixes directly to the repo as public PRs. Emails will be ignored.
|
||||
|
||||
If you have discovered a security vulnerability in this project that falls inside the [covered topics](#covered-topics), please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
|
||||
|
||||
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
|
||||
|
||||
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
|
||||
|
||||
> [!IMPORTANT]
|
||||
> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
|
||||
|
||||
## Requirements
|
||||
### Requirements
|
||||
|
||||
Before submitting your report, ensure you meet the following requirements:
|
||||
|
||||
@@ -31,7 +31,7 @@ Before submitting your report, ensure you meet the following requirements:
|
||||
|
||||
Maintainers reserve the right to close the report if these requirements are not fulfilled.
|
||||
|
||||
## Covered Topics
|
||||
### Covered Topics
|
||||
|
||||
Only vulnerabilities that fall within these parts of the project are considered valid. For problems falling outside of this list, please report them as issues.
|
||||
|
||||
|
||||
+12
-11
@@ -353,7 +353,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
|
||||
model.path = "";
|
||||
}
|
||||
common_download_opts hf_opts = opts;
|
||||
hf_opts.download_mmproj = true; // also look for mmproj when downloading hf model
|
||||
auto download_result = common_download_model(model, hf_opts);
|
||||
|
||||
if (download_result.model_path.empty()) {
|
||||
@@ -441,10 +440,11 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
|
||||
COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
|
||||
|
||||
common_download_opts opts;
|
||||
opts.bearer_token = params.hf_token;
|
||||
opts.offline = params.offline;
|
||||
opts.skip_download = params.skip_download;
|
||||
opts.download_mtp = spec_type_draft_mtp;
|
||||
opts.bearer_token = params.hf_token;
|
||||
opts.offline = params.offline;
|
||||
opts.skip_download = params.skip_download;
|
||||
opts.download_mtp = spec_type_draft_mtp;
|
||||
opts.download_mmproj = !params.no_mmproj;
|
||||
|
||||
try {
|
||||
auto res = common_params_handle_model(params.model, opts);
|
||||
@@ -1041,11 +1041,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
// we define here to make sure it's included in llama-gen-docs
|
||||
if (ex == LLAMA_EXAMPLE_COMPLETION) {
|
||||
params.use_jinja = false; // disable jinja by default
|
||||
|
||||
} else if (ex == LLAMA_EXAMPLE_MTMD) {
|
||||
params.use_jinja = false; // disable jinja by default
|
||||
params.sampling.temp = 0.2; // lower temp by default for better quality
|
||||
|
||||
} else if (ex == LLAMA_EXAMPLE_SERVER) {
|
||||
params.n_parallel = -1; // auto by default
|
||||
}
|
||||
@@ -1066,7 +1064,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
sampler_type_names.pop_back(); // remove last semicolon
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* filter options by example
|
||||
* rules:
|
||||
@@ -1080,7 +1077,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
add_opt(common_arg(
|
||||
{"-h", "--help", "--usage"},
|
||||
"print usage and exit",
|
||||
@@ -3031,6 +3027,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.timeout_write = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
|
||||
add_opt(common_arg(
|
||||
{"--sse-ping-interval"}, "N",
|
||||
string_format("server SSE ping interval in seconds (-1 = disabled, default: %d)", params.sse_ping_interval),
|
||||
[](common_params & params, int value) {
|
||||
params.sse_ping_interval = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSE_PING_INTERVAL"));
|
||||
add_opt(common_arg(
|
||||
{"--threads-http"}, "N",
|
||||
string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
|
||||
@@ -4081,7 +4084,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.sampling.top_k = 0;
|
||||
params.sampling.min_p = 0.01f;
|
||||
params.use_jinja = true;
|
||||
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
|
||||
@@ -4100,7 +4102,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.sampling.top_k = 0;
|
||||
params.sampling.min_p = 0.01f;
|
||||
params.use_jinja = true;
|
||||
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
|
||||
|
||||
+14
-15
@@ -1389,8 +1389,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
|
||||
if (params.warmup) {
|
||||
LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||
|
||||
llama_set_warmup(lctx, true);
|
||||
|
||||
std::vector<llama_token> tmp;
|
||||
llama_token bos = llama_vocab_bos(vocab);
|
||||
llama_token eos = llama_vocab_eos(vocab);
|
||||
@@ -1421,7 +1419,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
|
||||
llama_memory_clear(llama_get_memory(lctx), true);
|
||||
llama_synchronize(lctx);
|
||||
llama_perf_context_reset(lctx);
|
||||
llama_set_warmup(lctx, false);
|
||||
|
||||
// reset samplers to reset RNG state after warmup to the seeded state
|
||||
res->reset_samplers();
|
||||
@@ -1563,6 +1560,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||
cparams.n_ctx = params.n_ctx;
|
||||
cparams.n_seq_max = params.n_parallel;
|
||||
cparams.n_rs_seq = params.speculative.need_n_rs_seq();
|
||||
cparams.n_outputs_max = std::max(params.n_outputs_max, 0);
|
||||
cparams.n_batch = params.n_batch;
|
||||
cparams.n_ubatch = params.n_ubatch;
|
||||
cparams.n_threads = params.cpuparams.n_threads;
|
||||
@@ -1984,36 +1982,37 @@ bool common_replay_last_token(struct llama_context * ctx, llama_token last_token
|
||||
|
||||
bool common_prompt_batch_decode(
|
||||
struct llama_context * ctx,
|
||||
const std::vector<llama_token> & tokens,
|
||||
const std::vector<llama_token> & all_tokens,
|
||||
int n_new,
|
||||
int & n_past,
|
||||
int n_batch,
|
||||
std::string_view state_path,
|
||||
bool save_state) {
|
||||
const int n_eval = tokens.size();
|
||||
if (n_eval == 0) {
|
||||
if (n_new == 0) {
|
||||
return true;
|
||||
}
|
||||
const int offset = all_tokens.size() - n_new;
|
||||
|
||||
if (save_state && n_eval > 1) {
|
||||
const int n_tokens_before_last = n_eval - 1;
|
||||
if (save_state && n_new > 1) {
|
||||
const int n_tokens_before_last = n_new - 1;
|
||||
|
||||
GGML_ASSERT(n_eval <= n_batch);
|
||||
GGML_ASSERT(n_new <= n_batch);
|
||||
|
||||
// Decode all but the last token so we can save the memory state before decoding the last token.
|
||||
// This is done so we can restore the session state later and replay the last token.
|
||||
// Memory implementations in recurrent/hybrid models don't support removing tokens from their
|
||||
// memory, so we can't just remove the last token from the memory and replay the last token which
|
||||
// is the reason for this logic.
|
||||
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_tokens_before_last))) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(all_tokens.data() + offset), n_tokens_before_last))) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
n_past += n_tokens_before_last;
|
||||
|
||||
llama_state_save_file(ctx, state_path.data(), tokens.data(), n_tokens_before_last);
|
||||
LOG_INF("saved session before last token to %s, n_tokens = %d\n", state_path.data(), n_tokens_before_last);
|
||||
llama_state_save_file(ctx, state_path.data(), all_tokens.data(), all_tokens.size());
|
||||
LOG_INF("saved session before last token to %s, n_new = %zu\n", state_path.data(), all_tokens.size());
|
||||
|
||||
llama_token last_token = tokens.back();
|
||||
llama_token last_token = all_tokens.back();
|
||||
llama_batch batch = llama_batch_get_one(&last_token, 1);
|
||||
int32_t pos = n_past;
|
||||
batch.pos = &pos;
|
||||
@@ -2024,11 +2023,11 @@ bool common_prompt_batch_decode(
|
||||
}
|
||||
n_past++;
|
||||
} else {
|
||||
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_eval))) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(all_tokens.data() + offset), n_new))) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
n_past += n_eval;
|
||||
n_past += n_new;
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
+5
-1
@@ -277,6 +277,7 @@ struct common_params_sampling {
|
||||
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
|
||||
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
|
||||
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
|
||||
bool reasoning_control = false; // create the budget sampler on demand so reasoning can be ended at runtime
|
||||
|
||||
bool backend_sampling = false;
|
||||
|
||||
@@ -431,6 +432,7 @@ struct common_params {
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||
int32_t n_sequences = 1; // number of sequences to decode
|
||||
int32_t n_outputs_max = 0; // max outputs in a batch (0 = n_batch)
|
||||
int32_t grp_attn_n = 1; // group-attention factor
|
||||
int32_t grp_attn_w = 512; // group-attention width
|
||||
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
||||
@@ -590,6 +592,7 @@ struct common_params {
|
||||
bool reuse_port = false; // allow multiple sockets to bind to the same port
|
||||
int32_t timeout_read = 3600; // http read timeout in seconds
|
||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||
int32_t sse_ping_interval = 30; // SSE ping interval in seconds
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
bool cache_prompt = true; // whether to enable prompt caching
|
||||
@@ -927,7 +930,8 @@ void common_batch_add(
|
||||
// tokens from memory, so this approach works across all model architectures.
|
||||
bool common_prompt_batch_decode(
|
||||
struct llama_context * ctx,
|
||||
const std::vector<llama_token> & embd,
|
||||
const std::vector<llama_token> & all_tokens,
|
||||
int n_new,
|
||||
int & n_past,
|
||||
int n_batch,
|
||||
std::string_view state_path,
|
||||
|
||||
+1
-1
@@ -293,7 +293,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
}
|
||||
|
||||
// reasoning budget sampler (skip when budget is unlimited unless a lazy grammar is active, which needs rbudget for thinking-block suppression)
|
||||
if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0)) {
|
||||
if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0 || params.reasoning_control)) {
|
||||
rbudget = common_reasoning_budget_init(
|
||||
vocab,
|
||||
params.reasoning_budget_start,
|
||||
|
||||
+34
-12
@@ -1317,6 +1317,40 @@ static uint32_t common_get_enabled_speculative_configs(const std::vector<common_
|
||||
return result;
|
||||
}
|
||||
|
||||
int32_t common_speculative_n_max(const common_params_speculative * spec) {
|
||||
int32_t n_max = 0;
|
||||
|
||||
for (const auto type : spec->types) {
|
||||
switch (type) {
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE:
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3:
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT_MTP:
|
||||
n_max = std::max(n_max, std::max(0, spec->draft.n_max));
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE:
|
||||
n_max = std::max(n_max, (int32_t) spec->ngram_simple.size_m);
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:
|
||||
n_max = std::max(n_max, (int32_t) spec->ngram_map_k.size_m);
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V:
|
||||
n_max = std::max(n_max, (int32_t) spec->ngram_map_k4v.size_m);
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD:
|
||||
n_max = std::max(n_max, std::max(0, spec->ngram_mod.n_max));
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE:
|
||||
n_max = std::max(n_max, (int32_t) 8);
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_NONE:
|
||||
case COMMON_SPECULATIVE_TYPE_COUNT:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return n_max;
|
||||
}
|
||||
|
||||
// initialization of the speculative decoding system
|
||||
//
|
||||
common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq) {
|
||||
@@ -1325,8 +1359,6 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
{
|
||||
uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types);
|
||||
|
||||
bool has_draft_model_path = !params.draft.mparams.path.empty();
|
||||
|
||||
bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
|
||||
bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
|
||||
bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;
|
||||
@@ -1359,16 +1391,6 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
if (has_ngram_cache) {
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params));
|
||||
}
|
||||
if (has_draft_simple) {
|
||||
if (!has_draft_model_path) {
|
||||
LOG_WRN("%s: draft model is not specified - cannot use 'draft' type\n", __func__);
|
||||
has_draft_simple = false;
|
||||
}
|
||||
} else if (has_draft_model_path && !has_mtp && !has_draft_eagle3) {
|
||||
LOG_WRN("%s: draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n", __func__);
|
||||
has_draft_simple = true;
|
||||
}
|
||||
|
||||
if (has_draft_simple) {
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, params));
|
||||
}
|
||||
|
||||
@@ -20,6 +20,9 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
||||
// convert type to string
|
||||
std::string common_speculative_type_to_str(enum common_speculative_type type);
|
||||
|
||||
// return the max number of draft tokens based on the speculative parameters
|
||||
int32_t common_speculative_n_max(const common_params_speculative * spec);
|
||||
|
||||
common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq);
|
||||
|
||||
void common_speculative_free(common_speculative * spec);
|
||||
|
||||
@@ -58,6 +58,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"Ernie4_5_ForCausalLM": "ernie",
|
||||
"Ernie4_5_MoeForCausalLM": "ernie",
|
||||
"EuroBertModel": "bert",
|
||||
"Exaone4_5_ForConditionalGeneration": "exaone",
|
||||
"Exaone4ForCausalLM": "exaone",
|
||||
"ExaoneForCausalLM": "exaone",
|
||||
"ExaoneMoEForCausalLM": "exaone",
|
||||
@@ -134,6 +135,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"Mamba2ForCausalLM": "mamba",
|
||||
"MambaForCausalLM": "mamba",
|
||||
"MambaLMHeadModel": "mamba",
|
||||
"MellumForCausalLM": "mellum",
|
||||
"MiMoV2FlashForCausalLM": "mimo",
|
||||
"MiMoV2ForCausalLM": "mimo",
|
||||
"MiniCPM3ForCausalLM": "minicpm",
|
||||
@@ -214,6 +216,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"Starcoder2ForCausalLM": "starcoder",
|
||||
"Step3p5ForCausalLM": "step3",
|
||||
"StepVLForConditionalGeneration": "step3",
|
||||
"Step3p7ForConditionalGeneration": "step3",
|
||||
"T5EncoderModel": "t5",
|
||||
"T5ForConditionalGeneration": "t5",
|
||||
"T5WithLMHeadModel": "t5",
|
||||
@@ -240,6 +243,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
|
||||
"DeepseekOCR2ForCausalLM": "deepseek",
|
||||
"DeepseekOCRForCausalLM": "deepseek",
|
||||
"DotsOCRForCausalLM": "dotsocr",
|
||||
"Exaone4_5_ForConditionalGeneration": "exaone",
|
||||
"Gemma3ForConditionalGeneration": "gemma",
|
||||
"Gemma3nForConditionalGeneration": "gemma",
|
||||
"Gemma4ForConditionalGeneration": "gemma",
|
||||
@@ -281,6 +285,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
|
||||
"Sarashina2VisionForCausalLM": "sarashina2",
|
||||
"SmolVLMForConditionalGeneration": "smolvlm",
|
||||
"StepVLForConditionalGeneration": "step3",
|
||||
"Step3p7ForConditionalGeneration": "step3",
|
||||
"UltravoxModel": "ultravox",
|
||||
"VoxtralForConditionalGeneration": "ultravox",
|
||||
"YoutuVLForConditionalGeneration": "youtuvl",
|
||||
|
||||
+10
-1
@@ -1657,6 +1657,15 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
|
||||
# ref: https://huggingface.co/openbmb/MiniCPM5-1B
|
||||
res = "minicpm5"
|
||||
if chkhsh == "f241072145675bf8322086f115aebad05e9f869557a238bf2150a2a417d1bf60":
|
||||
# ref: https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2
|
||||
res = "granite-embed-multi-97m"
|
||||
if chkhsh == "789696f5946cc0fc59371f39f6097cafed196b3acded6140432f26bbb1ae1669":
|
||||
# ref: https://huggingface.co/ibm-granite/granite-embedding-311m-multilingual-r2
|
||||
res = "granite-embed-multi-311m"
|
||||
if chkhsh == "9dcf830ee9990cdbf78cc523a5f7bd9ad8f3f9890c2d3581d2785ad10f07049d":
|
||||
# ref: https://huggingface.co/JetBrains/Mellum2-12B-A2.5B-Base
|
||||
res = "mellum2"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2593,7 +2602,7 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
|
||||
# Step3-VL keeps text config under text_config but uses a custom top-level architecture.
|
||||
# For text conversion we route to a dedicated text-only class.
|
||||
# TODO: refactor this later to avoid adding exception here
|
||||
if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"):
|
||||
if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM", "Exaone4_5_ForConditionalGeneration", "Step3p7ForConditionalGeneration"):
|
||||
return arch
|
||||
|
||||
# if "architectures" is found in the sub-config, use that instead
|
||||
|
||||
@@ -603,6 +603,12 @@ class ModernBertModel(BertModel):
|
||||
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
||||
# FFN activation: ModernBert uses a GLU pair (ffn_up output is 2*n_ff). The
|
||||
# original ModernBERT uses GELU (-> GeGLU); some derivatives such as IBM
|
||||
# Granite Embedding 97m R2 use SiLU (-> SwiGLU). Persist this so the
|
||||
# llama.cpp graph can pick the matching activation.
|
||||
if hidden_act := self.hparams.get("hidden_activation"):
|
||||
self.gguf_writer.add_hidden_act(hidden_act)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
|
||||
+97
-2
@@ -3,14 +3,15 @@ from __future__ import annotations
|
||||
import math
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Iterable, TYPE_CHECKING
|
||||
from typing import Callable, Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
from .base import ModelBase, TextModel, gguf
|
||||
from .base import MmprojModel, ModelBase, TextModel, gguf
|
||||
from .qwenvl import Qwen2VLVisionModel
|
||||
|
||||
|
||||
@ModelBase.register("ExaoneForCausalLM")
|
||||
@@ -208,3 +209,97 @@ class ExaoneMoEModel(Exaone4Model):
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("Exaone4_5_ForConditionalGeneration")
|
||||
class Exaone4_5_TextModel(Exaone4Model):
|
||||
"""Text tower of EXAONE 4.5; Tensors match EXAONE4"""
|
||||
|
||||
model_arch = gguf.MODEL_ARCH.EXAONE4
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0)
|
||||
if n_nextn > 0:
|
||||
self.block_count = self.hparams["num_hidden_layers"] + n_nextn
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0)
|
||||
if n_nextn > 0:
|
||||
self.gguf_writer.add_nextn_predict_layers(n_nextn)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.startswith("mtp."):
|
||||
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0)
|
||||
if n_nextn <= 0:
|
||||
return
|
||||
nh = self.hparams["num_hidden_layers"]
|
||||
if ".layers." in name:
|
||||
share = self.hparams.get("mtp_share_layers", False)
|
||||
mtp_bid = bid if bid is not None else 0
|
||||
if share:
|
||||
for k in range(n_nextn):
|
||||
nn = name.replace(f"mtp.layers.{mtp_bid}", f"model.layers.{nh + k}")
|
||||
yield from super().modify_tensors(data_torch, nn, nh + k)
|
||||
return
|
||||
name = name.replace(f"mtp.layers.{mtp_bid}", f"model.layers.{mtp_bid + nh}")
|
||||
else:
|
||||
remapper = {
|
||||
"mtp.fc": gguf.MODEL_TENSOR.NEXTN_EH_PROJ,
|
||||
"mtp.pre_fc_norm_embedding": gguf.MODEL_TENSOR.NEXTN_ENORM,
|
||||
"mtp.pre_fc_norm_hidden": gguf.MODEL_TENSOR.NEXTN_HNORM,
|
||||
"mtp.norm": gguf.MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
|
||||
}
|
||||
_n = Path(name)
|
||||
key = _n.stem
|
||||
if key not in remapper:
|
||||
return
|
||||
for bid_mtp in range(nh, self.block_count):
|
||||
mapped_name = self.format_tensor_name(remapper[key], bid_mtp, suffix=_n.suffix)
|
||||
yield from ModelBase.modify_tensors(self, data_torch, mapped_name, bid_mtp)
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Exaone4_5_ForConditionalGeneration")
|
||||
class Exaone4_5VisionModel(Qwen2VLVisionModel):
|
||||
"""Vision tower for EXAONE 4.5; Qwen2-VL-style ViT (GQA) + patch merger"""
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
name = name.replace("model.visual.", "visual.", 1)
|
||||
return super().filter_tensors((name, gen))
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
MmprojModel.set_gguf_parameters(self)
|
||||
assert self.hparams_vision is not None
|
||||
hparams = self.hparams_vision
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.EXAONE4_5)
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
|
||||
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
|
||||
num_kv_head = self.find_vparam(["num_key_value_heads"], optional=True)
|
||||
if num_kv_head is not None:
|
||||
self.gguf_writer.add_vision_head_count_kv(num_kv_head)
|
||||
eps = hparams.get("rms_norm_eps", self.global_config.get("rms_norm_eps", 1e-6))
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(eps)
|
||||
if (window_size := hparams.get("window_size")) is not None:
|
||||
self.gguf_writer.add_vision_window_size(window_size)
|
||||
fullatt_block_indexes = hparams.get("fullatt_block_indexes")
|
||||
if fullatt_block_indexes:
|
||||
n_wa_pattern = fullatt_block_indexes[0] + 1
|
||||
for i in range(1, len(fullatt_block_indexes)):
|
||||
if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
|
||||
raise ValueError(f"Invalid EXAONE4.5 fullatt_block_indexes: {fullatt_block_indexes}")
|
||||
self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if ".qkv." in name:
|
||||
yield from ModelBase.modify_tensors(self, data_torch, name, bid)
|
||||
return
|
||||
|
||||
yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid)
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
from .base import ModelBase, TextModel, gguf, logger
|
||||
|
||||
|
||||
@ModelBase.register("MellumForCausalLM")
|
||||
class MellumModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.MELLUM
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
|
||||
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
|
||||
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
|
||||
|
||||
use_sliding_window = self.hparams.get("use_sliding_window")
|
||||
sliding_window = self.hparams.get("sliding_window")
|
||||
if (use_sliding_window is True or use_sliding_window is None) and sliding_window is not None:
|
||||
self.gguf_writer.add_sliding_window(sliding_window)
|
||||
logger.info(f"gguf: sliding window = {sliding_window}")
|
||||
self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in self.hparams["layer_types"]])
|
||||
logger.info(f"gguf: sliding window pattern length = {len(self.hparams['layer_types'])}")
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.find("experts") != -1:
|
||||
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
|
||||
yield from super().modify_tensors(data_torch, merged_name, bid)
|
||||
return
|
||||
else:
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
+125
-19
@@ -15,7 +15,7 @@ from .base import MmprojModel, ModelBase, TextModel, _MISTRAL_COMMON_DATASET_MEA
|
||||
from .qwen import Qwen3Model
|
||||
|
||||
|
||||
@ModelBase.register("StepVLForConditionalGeneration")
|
||||
@ModelBase.register("StepVLForConditionalGeneration", "Step3p7ForConditionalGeneration")
|
||||
class Step3VLVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@@ -95,10 +95,38 @@ class Step3VLTextModel(Qwen3Model):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN3
|
||||
|
||||
|
||||
@ModelBase.register("Step3p5ForCausalLM")
|
||||
@ModelBase.register("Step3p5ForCausalLM", "Step3p7ForConditionalGeneration")
|
||||
class Step35Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.STEP35
|
||||
|
||||
# The --mtp / --no-mtp toggles are ModelBase.mtp_only / no_mtp (set in
|
||||
# convert_hf_to_gguf.py main()). Unlike Qwen3.5, which stores MTP under a
|
||||
# `mtp.*` namespace, Step3.5 appends MTP layers at
|
||||
# `model.layers.{num_hidden_layers + i}`, so we filter them by layer index.
|
||||
# The trunk layer count is captured before indexing so the classmethod
|
||||
# filter_tensors can tell the appended MTP block(s) apart from the trunk.
|
||||
_n_main_layers: int | None = None
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
# NextN/MTP layers are appended past num_hidden_layers; extend the
|
||||
# tensor map to cover them so the MTP block's tensors get correctly
|
||||
# indexed names. When --no-mtp drops the MTP blocks, fall back to the
|
||||
# base num_hidden_layers so we don't reserve unused slots.
|
||||
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
|
||||
if n_nextn > 0 and not self.no_mtp:
|
||||
self.block_count += n_nextn
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None):
|
||||
# filter_tensors is a classmethod and can't reach self.hparams; stash
|
||||
# the trunk layer count here (before indexing runs) so it can detect
|
||||
# the appended MTP layers by index.
|
||||
hparams = {**self.hparams, **self.hparams.get("text_config", {})}
|
||||
key = next((k for k in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] if k in hparams), None)
|
||||
type(self)._n_main_layers = hparams.get(key)
|
||||
return super().index_tensors(remote_hf_model_id=remote_hf_model_id)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
rope_theta = self.hparams.get("rope_theta")
|
||||
if isinstance(rope_theta, list):
|
||||
@@ -119,8 +147,25 @@ class Step35Model(TextModel):
|
||||
n_head_swa = attn_other.get("num_attention_heads", n_head_base)
|
||||
n_kv_swa = attn_other.get("num_attention_groups", n_kv_base)
|
||||
|
||||
layer_types = layer_types[: self.block_count]
|
||||
partial_rotary_factors = partial_rotary_factors[: self.block_count]
|
||||
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
|
||||
|
||||
# The Step3p5 HF checkpoint stores layer_types/partial_rotary_factors
|
||||
# entries for the MTP blocks past num_hidden_layers; preserve them so
|
||||
# the MTP layer's attention shape, SWA flag, and partial RoPE dim are
|
||||
# set correctly. Pad with full-attention defaults if the checkpoint
|
||||
# truncated them.
|
||||
def _pad(arr, n, default):
|
||||
arr = list(arr)
|
||||
if len(arr) < n:
|
||||
arr = arr + [default] * (n - len(arr))
|
||||
return arr[:n]
|
||||
|
||||
layer_types = _pad(layer_types, self.block_count, "full_attention")
|
||||
partial_rotary_factors = _pad(
|
||||
partial_rotary_factors,
|
||||
self.block_count,
|
||||
0.5, # full_attention default for Step3p5
|
||||
)
|
||||
assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors
|
||||
head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types]
|
||||
kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types]
|
||||
@@ -157,31 +202,61 @@ class Step35Model(TextModel):
|
||||
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5))
|
||||
|
||||
# Optional per-layer SwiGLU clamps.
|
||||
# Optional per-layer SwiGLU clamps. MTP layers default to no clamping (0.0).
|
||||
if (limits := self.hparams.get("swiglu_limits")) is not None:
|
||||
limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]]
|
||||
limits_f = _pad(
|
||||
[0.0 if v is None else float(v) for v in limits],
|
||||
self.block_count,
|
||||
0.0,
|
||||
)
|
||||
self.gguf_writer.add_swiglu_clamp_exp(limits_f)
|
||||
if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None:
|
||||
limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]]
|
||||
limits_shared_f = _pad(
|
||||
[0.0 if v is None else float(v) for v in limits_shared],
|
||||
self.block_count,
|
||||
0.0,
|
||||
)
|
||||
self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f)
|
||||
|
||||
if n_nextn > 0 and not self.no_mtp:
|
||||
self.gguf_writer.add_nextn_predict_layers(n_nextn)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
if (titem := super().filter_tensors(item)) is None:
|
||||
return None
|
||||
name, gen = titem
|
||||
|
||||
# Map router bias (expert selection bias) to a GGUF bias tensor
|
||||
if name.endswith(".moe.router_bias"):
|
||||
name += ".bias"
|
||||
|
||||
return super().filter_tensors((name, gen))
|
||||
# Step3.5 appends the MTP block(s) past num_hidden_layers.
|
||||
assert cls._n_main_layers is not None
|
||||
is_mtp = (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None and int(m.group(1)) >= cls._n_main_layers
|
||||
|
||||
# --no-mtp: drop the appended MTP block(s) entirely.
|
||||
if is_mtp and cls.no_mtp:
|
||||
return None
|
||||
# --mtp: keep ONLY MTP-block tensors plus the shared embeddings/norm/
|
||||
# lm_head (so the resulting GGUF carries just the draft head).
|
||||
if cls.mtp_only and not is_mtp and name not in (
|
||||
"model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
|
||||
):
|
||||
return None
|
||||
|
||||
# The checkpoint nests the per-MTP-layer shared head under
|
||||
# `model.layers.{N+i}.transformer.shared_head.{norm,output}.weight`;
|
||||
# strip the `transformer.` infix and rename `output` → `head` so the
|
||||
# existing NEXTN_SHARED_HEAD_{NORM,HEAD} tensor mapping picks them up.
|
||||
# Mirrors vllm's `_rewrite_spec_layer_name` (step3p5_mtp.py).
|
||||
if is_mtp:
|
||||
name = name.replace(".transformer.", ".")
|
||||
name = name.replace("shared_head.output", "shared_head.head")
|
||||
|
||||
return name, gen
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
# remove mtp layers
|
||||
if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None:
|
||||
il = int(m.group(1))
|
||||
n_main = int(self.hparams.get("num_hidden_layers", self.block_count))
|
||||
if il >= n_main:
|
||||
return
|
||||
if name.endswith("norm.weight"):
|
||||
data_torch += 1.0
|
||||
|
||||
@@ -190,6 +265,21 @@ class Step35Model(TextModel):
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
def prepare_metadata(self, vocab_only: bool):
|
||||
from_dir = self.fname_out.is_dir()
|
||||
super().prepare_metadata(vocab_only=vocab_only)
|
||||
|
||||
# Mirror Qwen3.5's behavior: when emitting a draft-only file into a
|
||||
# directory, prefix with "mtp-" so it doesn't collide with the trunk.
|
||||
if not self.mtp_only or not from_dir:
|
||||
return
|
||||
|
||||
output_type: str = self.ftype.name.partition("_")[2]
|
||||
fname_default: str = gguf.naming_convention(
|
||||
self.metadata.name, self.metadata.basename, self.metadata.finetune,
|
||||
self.metadata.version, size_label=None, output_type=output_type, model_type=None)
|
||||
self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf"
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
# Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3").
|
||||
# llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS).
|
||||
@@ -203,11 +293,23 @@ class Step35Model(TextModel):
|
||||
if isinstance(rope_theta, list):
|
||||
rope_theta = rope_theta[0]
|
||||
base = float(rope_theta)
|
||||
if (dim := self.hparams.get("head_dim")) is None:
|
||||
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
dim = int(dim)
|
||||
|
||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||
if (storage_dim := self.hparams.get("head_dim")) is None:
|
||||
storage_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
storage_dim = int(storage_dim)
|
||||
|
||||
# Llama 3 factors apply only to the rotary dims used by full_attention layers
|
||||
# (partial_rotary_factor * head_dim). Remaining slots are padded with 1.0 so
|
||||
# sliding_attention layers remain unaffected. set_gguf_parameters already
|
||||
# guarantees at least one full_attention layer.
|
||||
layer_types = (self.hparams.get("layer_types") or [])[: self.block_count]
|
||||
partial_rotary_factors = (self.hparams.get("partial_rotary_factors") or [])[: self.block_count]
|
||||
full_attention_factor = next(
|
||||
float(f) for lt, f in zip(layer_types, partial_rotary_factors) if lt == "full_attention"
|
||||
)
|
||||
rotary_dim = int(storage_dim * full_attention_factor)
|
||||
|
||||
freqs = 1.0 / (base ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim))
|
||||
|
||||
factor = float(rope_params.get("factor", 8.0))
|
||||
low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
|
||||
@@ -228,4 +330,8 @@ class Step35Model(TextModel):
|
||||
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
|
||||
rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth))
|
||||
|
||||
# Pad to head_dim/2 with 1.0 so non-scaled layers remain neutral.
|
||||
if len(rope_factors) < storage_dim // 2:
|
||||
rope_factors.extend([1.0] * (storage_dim // 2 - len(rope_factors)))
|
||||
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
|
||||
|
||||
@@ -251,8 +251,9 @@ def main() -> None:
|
||||
|
||||
if args.mtp or args.no_mtp:
|
||||
from conversion.qwen import _Qwen35MtpMixin
|
||||
if not issubclass(model_class, _Qwen35MtpMixin):
|
||||
logger.error("--mtp / --no-mtp are only supported for Qwen3.5/3.6 text variants today")
|
||||
from conversion.step3 import Step35Model
|
||||
if not (issubclass(model_class, _Qwen35MtpMixin) or issubclass(model_class, Step35Model)):
|
||||
logger.error("--mtp / --no-mtp are only supported for Qwen3.5/3.6 and Step3.5 text variants today")
|
||||
sys.exit(1)
|
||||
if args.no_mtp:
|
||||
model_class.no_mtp = True
|
||||
|
||||
@@ -158,6 +158,9 @@ models = [
|
||||
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
|
||||
{"name": "talkie", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", },
|
||||
{"name": "minicpm5", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
|
||||
{"name": "granite-embed-multi-97m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2", },
|
||||
{"name": "granite-embed-multi-311m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-311m-multilingual-r2", },
|
||||
{"name": "mellum2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum2-12B-A2.5B-Base"},
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
|
||||
@@ -25,7 +25,7 @@ The convert script reads the model configuration, tokenizer, tensor names+data a
|
||||
|
||||
The required steps to implement for an HF model are:
|
||||
|
||||
1. Define the model `ModelBase.register` annotation in a new `TextModel` or `MmprojModel` subclass, example:
|
||||
1. Define the model `ModelBase.register` annotation in a new `TextModel` or `MmprojModel` subclass in the [conversion](/conversion) folder, example:
|
||||
|
||||
```python
|
||||
@ModelBase.register("MyModelForCausalLM")
|
||||
@@ -98,7 +98,7 @@ The model params and tensors layout must be defined in `llama.cpp` source files:
|
||||
1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
|
||||
2. In `src/llama-arch.cpp`:
|
||||
- Add the architecture name to the `LLM_ARCH_NAMES` map.
|
||||
- Add the list of model tensors to `llm_get_tensor_names` (you may also need to update `LLM_TENSOR_NAMES`)
|
||||
- You may also need to update `LLM_KV_NAMES`, `LLM_TENSOR_NAMES` and `LLM_TENSOR_INFOS`
|
||||
3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
|
||||
4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
|
||||
|
||||
@@ -106,10 +106,11 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc
|
||||
|
||||
### 3. Build the GGML graph implementation
|
||||
|
||||
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
|
||||
Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
|
||||
Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
|
||||
Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.
|
||||
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`:
|
||||
1. Create a new struct that inherits from `llama_model_base`.
|
||||
2. Implement the graph-building logic in its `build_arch_graph` method.
|
||||
3. The `build_arch_graph` method should return a constructed graph (inherited from `llm_graph_context`). Have a look at existing implementations like `llama_model_llama`, `llama_model_dbrx` or `llama_model_bert`.
|
||||
4. Then, in the `llama_model_mapping` function, add a case for your architecture to instantiate your new graph-building struct.
|
||||
|
||||
Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.
|
||||
|
||||
|
||||
@@ -381,11 +381,15 @@ extern "C" {
|
||||
// - most tensors have n_segments == 1 and a contiguous slice of the tensor data
|
||||
// - some tensors have an inhomogenenous data layout along the split axis,
|
||||
// those tensors are divided into segments which are each individually split across devices
|
||||
// - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
|
||||
// the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
|
||||
// - ne has one entry per segment and device and that segment repeats nr times,
|
||||
// in total when accounting for repetitions the segments add up to ggml_tensor::ne for that axis,
|
||||
// the outer/inner loops are over segments/devices like [seg0_dev0_r0, seg0_dev1_r0, seg0_dev0_r1, seg0_dev1_r1, seg1_dev0_r0, seg1_dev1_r0],
|
||||
// - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
|
||||
// that each need to be split individually across devices so that each device gets a slice of Q, K, and V
|
||||
// that each need to be split individually across devices so that each device gets a slice of Q, K, and V,
|
||||
// the Q matrix can be larger than the K and V matrices so this can either be expressed as 3 segments or as 2 segments
|
||||
// where the segment for K/V repeats twice
|
||||
int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
|
||||
uint32_t nr[16];
|
||||
uint32_t n_segments;
|
||||
};
|
||||
|
||||
|
||||
+142
-136
@@ -487,6 +487,9 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
|
||||
|
||||
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
ggml_backend_meta_simple_tensor_container & stc, const struct ggml_tensor * tensor, bool assume_sync) {
|
||||
// FIXME Currently this function preserves/erases the information in n_segments and nr in an inconsistent way.
|
||||
// Since the operations in question are developed specifically for llama.cpp this currently does not manifest as a bug there.
|
||||
// However, in a broader ggml context with arbitrary ggml graphs this can lead to unexpected results.
|
||||
const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
|
||||
|
||||
@@ -497,11 +500,11 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
int64_t sum_a = 0;
|
||||
for (size_t s = 0; s < a.n_segments; s++) {
|
||||
sum_a += a.ne[s*n_bufs + j];
|
||||
sum_a += a.ne[s*n_bufs + j] * a.nr[s];
|
||||
}
|
||||
int64_t sum_b = 0;
|
||||
for (size_t s = 0; s < b.n_segments; s++) {
|
||||
sum_b += b.ne[s*n_bufs + j];
|
||||
sum_b += b.ne[s*n_bufs + j] * b.nr[s];
|
||||
}
|
||||
if (sum_a != sum_b) {
|
||||
return false;
|
||||
@@ -511,7 +514,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
};
|
||||
|
||||
auto handle_generic = [&](const std::vector<ggml_backend_meta_split_state> & src_ss, bool scalar_only) -> ggml_backend_meta_split_state {
|
||||
ggml_backend_meta_split_state ret = {GGML_BACKEND_SPLIT_AXIS_NONE, {0}, 1};
|
||||
ggml_backend_meta_split_state ret = {GGML_BACKEND_SPLIT_AXIS_NONE, {0}, {1}, 1};
|
||||
for (size_t i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (tensor->src[i] == nullptr || tensor->src[i] == tensor) {
|
||||
continue;
|
||||
@@ -519,15 +522,15 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
if (ret.axis == GGML_BACKEND_SPLIT_AXIS_NONE) {
|
||||
ret = src_ss[i];
|
||||
} else if (!split_states_equal(src_ss[i], ret)) {
|
||||
ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ret.axis == GGML_BACKEND_SPLIT_AXIS_NONE) {
|
||||
ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
}
|
||||
if (scalar_only && ret.axis >= 0 && ret.axis < GGML_MAX_DIMS) {
|
||||
ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
}
|
||||
GGML_ASSERT(ret.axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
|
||||
return ret;
|
||||
@@ -571,42 +574,24 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
|
||||
auto handle_mul_mat = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
return {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, 1};
|
||||
return {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, {1}, 1};
|
||||
}
|
||||
if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_1 && src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
ggml_backend_meta_split_state ret = src_ss[0];
|
||||
ret.axis = GGML_BACKEND_SPLIT_AXIS_0;
|
||||
ret.nr[0] = 1;
|
||||
ret.n_segments = 1;
|
||||
return ret;
|
||||
}
|
||||
if (src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_1 && src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
ggml_backend_meta_split_state ret = src_ss[1];
|
||||
ret.n_segments = 1;
|
||||
return ret;
|
||||
return src_ss[1];
|
||||
}
|
||||
if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_0 && src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_0) {
|
||||
GGML_ASSERT(split_states_equal(src_ss[0], src_ss[1]));
|
||||
return {assume_sync ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_PARTIAL, {0}, 1};
|
||||
return {assume_sync ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_PARTIAL, {0}, {1}, 1};
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
};
|
||||
|
||||
auto handle_cpy = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
if (src_ss[0].axis >= 0 && src_ss[0].axis < GGML_MAX_DIMS) {
|
||||
int64_t ne_split_src = tensor->src[0]->ne[0];
|
||||
for (int dim = 1; dim <= src_ss[0].axis; dim++) {
|
||||
ne_split_src *= tensor->src[0]->ne[dim];
|
||||
}
|
||||
int64_t ne_split_dst = 1;
|
||||
for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
|
||||
ne_split_dst *= tensor->ne[dim];
|
||||
if (ne_split_dst == ne_split_src) {
|
||||
return {ggml_backend_meta_split_axis(dim), {0}, 1};
|
||||
}
|
||||
}
|
||||
}
|
||||
return handle_generic(src_ss, /*scalar_only =*/ false);
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
};
|
||||
|
||||
auto handle_reshape = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
@@ -615,33 +600,25 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
case GGML_BACKEND_SPLIT_AXIS_1:
|
||||
case GGML_BACKEND_SPLIT_AXIS_2:
|
||||
case GGML_BACKEND_SPLIT_AXIS_3: {
|
||||
GGML_ASSERT(!ggml_is_permuted(tensor) && !ggml_is_permuted(tensor->src[0]));
|
||||
if (src_ss[0].axis == ggml_n_dims(tensor->src[0]) - 1) {
|
||||
return {ggml_backend_meta_split_axis(ggml_n_dims(tensor) - 1), {0}, 1};
|
||||
GGML_ASSERT(src_ss[0].n_segments == 1);
|
||||
if (src_ss[0].axis == ggml_n_dims(tensor->src[0]) - 1 && src_ss[0].nr[0] == 1) {
|
||||
return {ggml_backend_meta_split_axis(ggml_n_dims(tensor) - 1), {0}, {1}, 1};
|
||||
}
|
||||
std::vector<int64_t> base_ne_in;
|
||||
base_ne_in.reserve(GGML_MAX_DIMS - src_ss[0].axis);
|
||||
{
|
||||
base_ne_in.push_back(1);
|
||||
int dim = 0;
|
||||
for (; dim <= src_ss[0].axis; dim++) {
|
||||
base_ne_in[0] *= tensor->src[0]->ne[dim];
|
||||
}
|
||||
for (; dim <= GGML_MAX_DIMS; dim++) {
|
||||
base_ne_in.push_back(base_ne_in.back() * tensor->src[0]->ne[dim]);
|
||||
}
|
||||
int64_t base_ne_in = tensor->src[0]->ne[0];
|
||||
for (int dim = 1; dim <= src_ss[0].axis; dim++) {
|
||||
base_ne_in *= tensor->src[0]->ne[dim];
|
||||
}
|
||||
base_ne_in /= src_ss[0].nr[0];
|
||||
int64_t base_ne_out = 1;
|
||||
for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
|
||||
const int64_t base_ne_out_next = base_ne_out *= tensor->ne[dim];
|
||||
for (const int64_t & bni : base_ne_in) {
|
||||
if (bni == base_ne_out_next) {
|
||||
return {ggml_backend_meta_split_axis(dim), {0}, 1};
|
||||
}
|
||||
if (base_ne_out_next % base_ne_in == 0) {
|
||||
return {ggml_backend_meta_split_axis(dim), {0}, {uint32_t(base_ne_out_next/base_ne_in)}, 1};
|
||||
}
|
||||
if (base_ne_out_next > base_ne_in[0]) {
|
||||
GGML_ASSERT(dim + 1 < GGML_MAX_DIMS);
|
||||
return {ggml_backend_meta_split_axis(dim + 1), {0}, 1};
|
||||
if (base_ne_out_next > base_ne_in) {
|
||||
GGML_ASSERT(src_ss[0].n_segments == 1);
|
||||
GGML_ASSERT(src_ss[0].nr[0] == 1);
|
||||
return {ggml_backend_meta_split_axis(dim), {0}, {1}, 1};
|
||||
}
|
||||
base_ne_out = base_ne_out_next;
|
||||
}
|
||||
@@ -653,11 +630,18 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
}
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto handle_cpy = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
if (src_ss[0].axis >= 0 && src_ss[0].axis < GGML_MAX_DIMS) {
|
||||
return handle_reshape(src_ss);
|
||||
}
|
||||
return handle_generic(src_ss, /*scalar_only =*/ false);
|
||||
};
|
||||
|
||||
auto handle_view = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
if (ggml_is_contiguous(tensor) && ggml_is_contiguous(tensor->src[0])) {
|
||||
return handle_reshape(src_ss);
|
||||
@@ -681,7 +665,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
if (!ggml_is_permuted(tensor) && !ggml_is_permuted(tensor->src[0]) && axis >= 0 && axis < GGML_MAX_DIMS-1) {
|
||||
for (int dim = 0; dim < GGML_MAX_DIMS-1; dim++) {
|
||||
if (tensor->nb[dim+1] == tensor->src[0]->nb[axis+1]) {
|
||||
return {ggml_backend_meta_split_axis(dim), {0}, 1};
|
||||
return {ggml_backend_meta_split_axis(dim), {0}, {1}, 1};
|
||||
}
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
@@ -690,7 +674,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
return src_ss[0];
|
||||
}
|
||||
GGML_ABORT("view of permuted tensor not implemented");
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
};
|
||||
|
||||
auto handle_permute = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
@@ -699,7 +683,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
case GGML_BACKEND_SPLIT_AXIS_1:
|
||||
case GGML_BACKEND_SPLIT_AXIS_2:
|
||||
case GGML_BACKEND_SPLIT_AXIS_3: {
|
||||
return {ggml_backend_meta_split_axis(tensor->op_params[src_ss[0].axis]), {0}, 1};
|
||||
GGML_ASSERT(src_ss[0].n_segments == 1 || src_ss[0].nr[0] == 1);
|
||||
return {ggml_backend_meta_split_axis(tensor->op_params[src_ss[0].axis]), {0}, {src_ss[0].nr[0]}, 1};
|
||||
}
|
||||
case GGML_BACKEND_SPLIT_AXIS_MIRRORED:
|
||||
case GGML_BACKEND_SPLIT_AXIS_PARTIAL: {
|
||||
@@ -707,7 +692,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
}
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -716,7 +701,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
switch (src_ss[0].axis) {
|
||||
case GGML_BACKEND_SPLIT_AXIS_0:
|
||||
case GGML_BACKEND_SPLIT_AXIS_1: {
|
||||
return {ggml_backend_meta_split_axis(int(src_ss[0].axis) ^ 1), {0}, 1};
|
||||
GGML_ASSERT(src_ss[0].n_segments == 1 || src_ss[0].nr[0] == 1);
|
||||
return {ggml_backend_meta_split_axis(int(src_ss[0].axis) ^ 1), {0}, {src_ss[0].nr[0]}, 1};
|
||||
}
|
||||
case GGML_BACKEND_SPLIT_AXIS_2:
|
||||
case GGML_BACKEND_SPLIT_AXIS_3:
|
||||
@@ -726,7 +712,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
}
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -764,16 +750,16 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
GGML_ASSERT( src_ss[2].axis == GGML_BACKEND_SPLIT_AXIS_2);
|
||||
GGML_ASSERT(tensor->src[4] == nullptr || src_ss[3].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED);
|
||||
GGML_ASSERT(tensor->src[4] == nullptr || src_ss[4].axis == GGML_BACKEND_SPLIT_AXIS_0);
|
||||
return {GGML_BACKEND_SPLIT_AXIS_1, {0}, 1};
|
||||
return {GGML_BACKEND_SPLIT_AXIS_1, {0}, {1}, 1};
|
||||
};
|
||||
|
||||
auto handle_ssm_conv = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
if (src_ss[0].axis == src_ss[1].axis) {
|
||||
if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_0) {
|
||||
return {GGML_BACKEND_SPLIT_AXIS_1, {0}, 1};
|
||||
return {GGML_BACKEND_SPLIT_AXIS_1, {0}, {1}, 1};
|
||||
}
|
||||
if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_1) {
|
||||
return {GGML_BACKEND_SPLIT_AXIS_0, {0}, 1};
|
||||
return {GGML_BACKEND_SPLIT_AXIS_0, {0}, {1}, 1};
|
||||
}
|
||||
}
|
||||
return handle_generic(src_ss, /*scalar_only =*/ false);
|
||||
@@ -781,8 +767,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
|
||||
auto handle_gated_delta_net = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED &&
|
||||
src_ss[2].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[3].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED &&
|
||||
src_ss[4].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
src_ss[2].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[3].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED &&
|
||||
src_ss[4].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
return src_ss[0];
|
||||
}
|
||||
GGML_ASSERT(src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
@@ -793,12 +779,12 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
// state shape is (S_v*S_v*H, K, n_seqs); the heads dim is nested inside axis 0,
|
||||
// so a head-aligned split on the input cache reshapes to axis 0 here (not axis 2).
|
||||
GGML_ASSERT(src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_2 || src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_1 || src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_0);
|
||||
return {GGML_BACKEND_SPLIT_AXIS_0, {0}, 1};
|
||||
return {GGML_BACKEND_SPLIT_AXIS_0, {0}, {1}, 1};
|
||||
};
|
||||
|
||||
auto calculate_split_state = [&]() -> ggml_backend_meta_split_state {
|
||||
if (ggml_nelements(tensor) == 0) {
|
||||
return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
}
|
||||
if (ggml_backend_buffer_get_usage(tensor->buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE && tensor->view_src == nullptr) {
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(ggml_backend_buffer_get_type(tensor->buffer));
|
||||
@@ -807,19 +793,21 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
if (ret.axis >= 0 && ret.axis <= GGML_MAX_DIMS) {
|
||||
const int64_t granularity = ret.axis == GGML_BACKEND_SPLIT_AXIS_0 ? ggml_blck_size(tensor->type) : 1;
|
||||
int64_t ne_sum = 0;
|
||||
for (size_t sj = 0; sj < ret.n_segments*n_bufs; sj++) {
|
||||
GGML_ASSERT(ret.ne[sj] % granularity == 0);
|
||||
ne_sum += ret.ne[sj];
|
||||
for (size_t s = 0; s < ret.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
GGML_ASSERT(ret.ne[s*n_bufs + j] % granularity == 0);
|
||||
ne_sum += ret.ne[s*n_bufs + j] * ret.nr[s];
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(ne_sum == tensor->ne[ret.axis]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::vector<ggml_backend_meta_split_state> src_ss(GGML_MAX_SRC, {GGML_BACKEND_SPLIT_AXIS_NONE, {0}, 1});
|
||||
std::vector<ggml_backend_meta_split_state> src_ss(GGML_MAX_SRC, {GGML_BACKEND_SPLIT_AXIS_NONE, {0}, {1}, 1});
|
||||
for (size_t i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (tensor->src[i] == nullptr || tensor->src[i] == tensor) {
|
||||
src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
continue;
|
||||
}
|
||||
src_ss[i] = ggml_backend_meta_get_split_state(stc, tensor->src[i], /*assume_sync =*/ true);
|
||||
@@ -829,7 +817,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
ggml_backend_meta_split_state split_state;
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_NONE: {
|
||||
split_state = {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, 1};
|
||||
split_state = {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, {1}, 1};
|
||||
} break;
|
||||
case GGML_OP_DUP: {
|
||||
split_state = handle_generic(src_ss, /*scalar_only =*/ true);
|
||||
@@ -1016,7 +1004,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("ggml op not implemented: %s", ggml_op_name(tensor->op));
|
||||
split_state = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
split_state = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
} break;
|
||||
}
|
||||
if (split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS) {
|
||||
@@ -1034,23 +1022,25 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
split_state.ne[s*n_bufs + j] = 0;
|
||||
}
|
||||
for (size_t s = 0; s < src_ss[i].n_segments; s++) {
|
||||
split_state.ne[j] += src_ss[i].ne[s*n_bufs + j];
|
||||
split_state.ne[j] += src_ss[i].ne[s*n_bufs + j] * src_ss[i].nr[s];
|
||||
}
|
||||
split_state.ne[j] *= tensor->ne[split_state.axis];
|
||||
if (split_state.ne[j] != 0 || tensor->src[i]->ne[src_ss[i].axis] != 0) {
|
||||
GGML_ASSERT(split_state.ne[j] % tensor->src[i]->ne[src_ss[i].axis] == 0);
|
||||
split_state.ne[j] /= tensor->src[i]->ne[src_ss[i].axis];
|
||||
const int64_t div = tensor->src[i]->ne[src_ss[i].axis] * split_state.nr[0];
|
||||
GGML_ASSERT(split_state.ne[j] % div == 0);
|
||||
split_state.ne[j] /= div;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(split_state.n_segments == 1);
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
// Assert that ratio is consistent:
|
||||
int64_t sum = 0;
|
||||
for (size_t s = 0; s < src_ss[i].n_segments; s++) {
|
||||
sum += src_ss[i].ne[s*n_bufs + j];
|
||||
sum += src_ss[i].ne[s*n_bufs + j] * src_ss[i].nr[s];
|
||||
}
|
||||
// Assert that ratio is consistent:
|
||||
GGML_ASSERT(split_state.ne[j] * tensor->src[i]->ne[src_ss[i].axis]
|
||||
== sum * tensor->ne[split_state.axis]);
|
||||
GGML_ASSERT(split_state.ne[j]*split_state.nr[0] * tensor->src[i]->ne[src_ss[i].axis]
|
||||
== sum * tensor->ne[split_state.axis]);
|
||||
}
|
||||
}
|
||||
first_src_split_by_axis = false;
|
||||
@@ -1080,13 +1070,14 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
srcs_info += ", ";
|
||||
}
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor->src[0], true);
|
||||
GGML_ASSERT(split_state.n_segments == 1);
|
||||
const char * axis_name = ggml_backend_meta_split_axis_name(split_state.axis);
|
||||
std::string ne_info;
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
if (!ne_info.empty()) {
|
||||
ne_info += ", ";
|
||||
}
|
||||
ne_info += std::to_string(split_state.ne[j]);
|
||||
ne_info += std::to_string(split_state.ne[j]) + "x" + std::to_string(split_state.nr[0]);
|
||||
}
|
||||
srcs_info += std::string(tensor->src[i]->name) + "[" + ggml_op_name(tensor->src[i]->op) + ", " + axis_name + ", {" + ne_info + "}]";
|
||||
}
|
||||
@@ -1095,7 +1086,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
if (!ne_info.empty()) {
|
||||
ne_info += ", ";
|
||||
}
|
||||
ne_info += std::to_string(buf_ctx->split_state_cache[key].first.ne[j]);
|
||||
const ggml_backend_meta_split_state & ss = buf_ctx->split_state_cache[key].first;
|
||||
ne_info += std::to_string(ss.ne[j]) + "x" + std::to_string(ss.nr[0]);
|
||||
}
|
||||
GGML_LOG_DEBUG("SPLIT_STATE: {%s} -> %s[%s, %s, {%s}]\n", srcs_info.c_str(), tensor->name, ggml_op_name(tensor->op),
|
||||
ggml_backend_meta_split_axis_name(buf_ctx->split_state_cache[key].first.axis), ne_info.c_str());
|
||||
@@ -1107,8 +1099,10 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
#ifndef NDEBUG
|
||||
if (ret.axis >= 0 && ret.axis < GGML_MAX_DIMS) {
|
||||
int64_t ne_ret = 0;
|
||||
for (size_t sj = 0; sj < ret.n_segments*n_bufs; sj++) {
|
||||
ne_ret += ret.ne[sj];
|
||||
for (size_t s = 0; s < ret.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ne_ret += ret.ne[s*n_bufs + j] * ret.nr[s];
|
||||
}
|
||||
}
|
||||
assert(ne_ret == tensor->ne[int(ret.axis)]);
|
||||
}
|
||||
@@ -1155,7 +1149,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_m
|
||||
// GGML_ASSERT(ggml_is_contiguously_allocated(tensor));
|
||||
ne[split_dim] = 0;
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
ne[split_dim] += split_state.ne[s*n_simple_bufs + j];
|
||||
ne[split_dim] += split_state.ne[s*n_simple_bufs + j] * split_state.nr[s];
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (tensor->nb[i] > tensor->nb[split_dim]) {
|
||||
@@ -1229,7 +1223,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_m
|
||||
for (size_t j = 0; j < n_simple_bufs; j++) {
|
||||
int64_t ne_sum = 0;
|
||||
for (size_t s = 0; s < split_state_src.n_segments; s++) {
|
||||
ne_sum += split_state_src.ne[s*n_simple_bufs + j];
|
||||
ne_sum += split_state_src.ne[s*n_simple_bufs + j] * split_state_src.nr[s];
|
||||
}
|
||||
if (ne_sum == 0) {
|
||||
simple_tensors[j]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
|
||||
@@ -1255,8 +1249,9 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
||||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
|
||||
|
||||
if (split_state.n_segments != 1) {
|
||||
if (split_state.n_segments != 1 || split_state.nr[0] != 1) {
|
||||
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
|
||||
GGML_ASSERT(split_state.nr[0] != 0);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
|
||||
size_t offset_data = 0;
|
||||
@@ -1267,24 +1262,26 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
||||
const size_t row_stride = tensor->nb[1];
|
||||
GGML_ASSERT(offset % row_stride == 0);
|
||||
GGML_ASSERT(size % row_stride == 0);
|
||||
const int64_t r_start = offset / row_stride;
|
||||
const int64_t r_count = size / row_stride;
|
||||
GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
|
||||
const int64_t row_start = offset / row_stride;
|
||||
const int64_t row_count = size / row_stride;
|
||||
GGML_ASSERT(row_start + row_count <= tensor->ne[1]);
|
||||
|
||||
const int64_t blck_size = ggml_blck_size(tensor->type);
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
|
||||
simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
|
||||
r_count, simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
for (size_t r = 0; r < split_state.nr[s]; r++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
|
||||
simple_offsets[j] + row_start * simple_tensor->nb[1], nbytes,
|
||||
row_count, simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*r_count == size);
|
||||
GGML_ASSERT(offset_data*row_count == size);
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
@@ -1292,22 +1289,24 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
||||
const size_t row_stride = tensor->nb[2];
|
||||
GGML_ASSERT(offset % row_stride == 0);
|
||||
GGML_ASSERT(size % row_stride == 0);
|
||||
const int64_t r_start = offset / row_stride;
|
||||
const int64_t r_count = size / row_stride;
|
||||
GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
|
||||
const int64_t row_start = offset / row_stride;
|
||||
const int64_t row_count = size / row_stride;
|
||||
GGML_ASSERT(row_start + row_count <= tensor->ne[2]);
|
||||
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
|
||||
simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
|
||||
r_count, simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
for (size_t r = 0; r < split_state.nr[s]; r++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
|
||||
simple_offsets[j] + row_start * simple_tensor->nb[2], nbytes,
|
||||
row_count, simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*r_count == size);
|
||||
GGML_ASSERT(offset_data*row_count == size);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1365,8 +1364,9 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
||||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
|
||||
|
||||
if (split_state.n_segments != 1) {
|
||||
if (split_state.n_segments != 1 || split_state.nr[0] != 1) {
|
||||
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
|
||||
GGML_ASSERT(split_state.nr[0] != 0);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
|
||||
size_t offset_data = 0;
|
||||
@@ -1377,24 +1377,26 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
||||
const size_t row_stride = tensor->nb[1];
|
||||
GGML_ASSERT(offset % row_stride == 0);
|
||||
GGML_ASSERT(size % row_stride == 0);
|
||||
const int64_t r_start = offset / row_stride;
|
||||
const int64_t r_count = size / row_stride;
|
||||
GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
|
||||
const int64_t row_start = offset / row_stride;
|
||||
const int64_t row_count = size / row_stride;
|
||||
GGML_ASSERT(row_start + row_count <= tensor->ne[1]);
|
||||
|
||||
const int64_t blck_size = ggml_blck_size(tensor->type);
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
|
||||
simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
|
||||
r_count, simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
for (size_t r = 0; r < split_state.nr[s]; r++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
|
||||
simple_offsets[j] + row_start * simple_tensor->nb[1], nbytes,
|
||||
row_count, simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*r_count == size);
|
||||
GGML_ASSERT(offset_data*row_count == size);
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
@@ -1402,22 +1404,24 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
||||
const size_t row_stride = tensor->nb[2];
|
||||
GGML_ASSERT(offset % row_stride == 0);
|
||||
GGML_ASSERT(size % row_stride == 0);
|
||||
const int64_t r_start = offset / row_stride;
|
||||
const int64_t r_count = size / row_stride;
|
||||
GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
|
||||
const int64_t row_start = offset / row_stride;
|
||||
const int64_t row_count = size / row_stride;
|
||||
GGML_ASSERT(row_start + row_count <= tensor->ne[2]);
|
||||
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
|
||||
simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
|
||||
r_count, simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
for (size_t r = 0; r < split_state.nr[s]; r++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
|
||||
simple_offsets[j] + row_start * simple_tensor->nb[2], nbytes,
|
||||
row_count, simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*r_count == size);
|
||||
GGML_ASSERT(offset_data*row_count == size);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1675,6 +1679,7 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
|
||||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
|
||||
GGML_ASSERT(split_state.n_segments == 1);
|
||||
GGML_ASSERT(split_state.nr[0] == 1);
|
||||
|
||||
switch (split_state.axis) {
|
||||
case GGML_BACKEND_SPLIT_AXIS_0:
|
||||
@@ -1719,6 +1724,7 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm
|
||||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
|
||||
GGML_ASSERT(split_state.n_segments == 1);
|
||||
GGML_ASSERT(split_state.nr[0] == 1);
|
||||
|
||||
switch (split_state.axis) {
|
||||
case GGML_BACKEND_SPLIT_AXIS_0:
|
||||
|
||||
@@ -568,7 +568,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
constexpr bool Q_in_reg = ggml_cuda_fattn_mma_get_Q_in_reg (DKQ, DV, ncols);
|
||||
constexpr int nstages = ggml_cuda_fattn_mma_get_nstages (DKQ, DV, ncols1, ncols2);
|
||||
|
||||
constexpr int stride_tile_Q = DKQ/2 + 4;
|
||||
constexpr int stride_tile_K = nbatch_K2 + 4;
|
||||
|
||||
constexpr int stride_tile_V = V_is_K_view ? stride_tile_K : nbatch_V2 + 4;
|
||||
@@ -604,9 +603,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
#pragma unroll
|
||||
for (int k0_start = (DKQ/2-1) - (DKQ/2-1) % nbatch_K2; k0_start >= 0; k0_start -= nbatch_K2) {
|
||||
const int k0_stop = k0_start + nbatch_K2 < DKQ/2 ? k0_start + nbatch_K2 : DKQ/2;
|
||||
const int k0_diff = k0_stop - k0_start;
|
||||
|
||||
if constexpr (nstages <= 1) {
|
||||
const int k0_diff = k0_stop - k0_start;
|
||||
constexpr bool use_cp_async = nstages == 1;
|
||||
flash_attn_ext_f16_load_tile<stride_tile_K, nwarps, nbatch_fa, use_cp_async, oob_check>
|
||||
(K_h2 + int64_t(k_VKQ_0)*stride_K + k0_start, tile_K, k0_diff, stride_K, k_VKQ_sup);
|
||||
@@ -640,6 +639,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
}
|
||||
}
|
||||
} else {
|
||||
constexpr int stride_tile_Q = DKQ/2 + 4;
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += T_A_KQ::J) {
|
||||
load_ldmatrix(Q_B[0], tile_Q + (threadIdx.y / np)*(T_B_KQ::I*stride_tile_Q) + k_KQ_0, stride_tile_Q);
|
||||
@@ -954,9 +954,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
for (int i0_start = 0; i0_start < DV; i0_start += 2*nbatch_V2) {
|
||||
static_assert(DV % (2*nbatch_V2) == 0, "bad loop size");
|
||||
const int i0_stop = i0_start + 2*nbatch_V2;
|
||||
const int i0_diff = i0_stop - i0_start;
|
||||
|
||||
if constexpr (nstages <= 1) {
|
||||
const int i0_diff = i0_stop - i0_start;
|
||||
if (!V_is_K_view || i0_stop > 2*nbatch_K2) {
|
||||
constexpr bool use_cp_async = nstages == 1;
|
||||
flash_attn_ext_f16_load_tile<stride_tile_V, nwarps, nbatch_fa, use_cp_async, oob_check>
|
||||
|
||||
@@ -43,7 +43,6 @@ gated_delta_net_cuda(const float * q,
|
||||
// output state layout (per-slot D * n_seqs) — same per-(seq,head) offset as before.
|
||||
const int64_t state_in_offset = sequence * K * H * S_v * S_v + h_idx * S_v * S_v;
|
||||
const int64_t state_out_offset = (sequence * H + h_idx) * S_v * S_v;
|
||||
const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output
|
||||
state += state_out_offset;
|
||||
curr_state += state_in_offset + col * S_v;
|
||||
attn_data += (sequence * n_tokens * H + h_idx) * S_v;
|
||||
@@ -61,10 +60,6 @@ gated_delta_net_cuda(const float * q,
|
||||
s_shard[r] = curr_state[i];
|
||||
}
|
||||
|
||||
// slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots
|
||||
// are written; earlier slots are left untouched (caller-owned).
|
||||
const int shift = (int) n_tokens - K;
|
||||
|
||||
for (int t = 0; t < n_tokens; t++) {
|
||||
const float * q_t = q + iq3 * sq3 + t * sq2 + iq1 * sq1;
|
||||
const float * k_t = k + iq3 * sq3 + t * sq2 + iq1 * sq1;
|
||||
@@ -148,6 +143,11 @@ gated_delta_net_cuda(const float * q,
|
||||
attn_data += S_v * H;
|
||||
|
||||
if constexpr (keep_rs_t) {
|
||||
// slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots
|
||||
// are written; earlier slots are left untouched (caller-owned).
|
||||
const int shift = (int) n_tokens - K;
|
||||
|
||||
const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output
|
||||
const int target_slot = t - shift;
|
||||
if (target_slot >= 0 && target_slot < K) {
|
||||
float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset;
|
||||
|
||||
@@ -91,7 +91,7 @@ static __global__ void mul_mat_f(
|
||||
const int row0 = blockIdx.x * rows_per_block;
|
||||
|
||||
int expert_idx = 0;
|
||||
int col_base = 0;
|
||||
[[maybe_unused]] int col_base = 0;
|
||||
|
||||
const int channel_dst = has_ids ? 0 : blockIdx.y;
|
||||
|
||||
@@ -122,12 +122,12 @@ static __global__ void mul_mat_f(
|
||||
ids += col_offset * stride_row_id;
|
||||
}
|
||||
|
||||
const float2 * y2 = (const float2 *) y;
|
||||
[[maybe_unused]] const float2 * y2 = (const float2 *) y;
|
||||
|
||||
extern __shared__ char data_mmv[];
|
||||
|
||||
char * shmem_base = data_mmv;
|
||||
int * slot_map = (int *) shmem_base;
|
||||
[[maybe_unused]] int * slot_map = (int *) shmem_base;
|
||||
char * compute_base = has_ids ? (shmem_base + GGML_PAD(cols_per_block, 16) * sizeof(int)) : shmem_base;
|
||||
|
||||
tile_C C[ntA][ntB];
|
||||
|
||||
@@ -80,9 +80,8 @@ static __global__ void mul_mat_vec_f(
|
||||
gate_x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
|
||||
}
|
||||
|
||||
const int channel_bias = ids ? channel_x : channel_dst;
|
||||
|
||||
if constexpr (has_fusion) {
|
||||
const int channel_bias = ids ? channel_x : channel_dst;
|
||||
if (use_bias) {
|
||||
x_bias += int64_t(sample_dst)*stride_sample_dst + channel_bias*stride_channel_dst;
|
||||
}
|
||||
@@ -95,7 +94,7 @@ static __global__ void mul_mat_vec_f(
|
||||
|
||||
extern __shared__ char data_mmv[];
|
||||
float * buf_iw = (float *) data_mmv;
|
||||
float * buf_iw_gate = nullptr;
|
||||
[[maybe_unused]] float * buf_iw_gate = nullptr;
|
||||
if constexpr (has_fusion) {
|
||||
buf_iw_gate = (float *) (data_mmv + warp_size*sizeof(float));
|
||||
}
|
||||
@@ -123,7 +122,7 @@ static __global__ void mul_mat_vec_f(
|
||||
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
const float2 * x2 = (const float2 *) x;
|
||||
const float2 * gate_x2 = nullptr;
|
||||
[[maybe_unused]] const float2 * gate_x2 = nullptr;
|
||||
if constexpr (has_fusion) {
|
||||
if (use_gate) {
|
||||
gate_x2 = (const float2 *) gate_x;
|
||||
@@ -155,7 +154,7 @@ static __global__ void mul_mat_vec_f(
|
||||
}
|
||||
} else if constexpr (std::is_same_v<T, half>) {
|
||||
const half2 * x2 = (const half2 *) x;
|
||||
const half2 * gate_x2 = nullptr;
|
||||
[[maybe_unused]] const half2 * gate_x2 = nullptr;
|
||||
if constexpr (has_fusion) {
|
||||
if (use_gate) {
|
||||
gate_x2 = (const half2 *) gate_x;
|
||||
@@ -266,7 +265,7 @@ static __global__ void mul_mat_vec_f(
|
||||
}
|
||||
#else
|
||||
const nv_bfloat162 * x2 = (const nv_bfloat162 *) x;
|
||||
const nv_bfloat162 * gate_x2 = nullptr;
|
||||
[[maybe_unused]] const nv_bfloat162 * gate_x2 = nullptr;
|
||||
if constexpr (has_fusion) {
|
||||
if (use_gate) {
|
||||
gate_x2 = (const nv_bfloat162 *) gate_x;
|
||||
@@ -274,7 +273,7 @@ static __global__ void mul_mat_vec_f(
|
||||
}
|
||||
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
|
||||
const nv_bfloat162 tmpx = x2[col2];
|
||||
nv_bfloat162 tmpx_gate;
|
||||
[[maybe_unused]] nv_bfloat162 tmpx_gate;
|
||||
if constexpr (has_fusion) {
|
||||
if (use_gate) {
|
||||
tmpx_gate = gate_x2[col2];
|
||||
|
||||
@@ -515,7 +515,7 @@ static __global__ void mul_mat_vec_q(
|
||||
bool use_gate = false;
|
||||
bool use_bias = false;
|
||||
bool use_gate_bias = false;
|
||||
const void * vgate = nullptr;
|
||||
[[maybe_unused]] const void * vgate = nullptr;
|
||||
const float * x_bias = nullptr;
|
||||
const float * gate_bias = nullptr;
|
||||
ggml_glu_op active_glu;
|
||||
@@ -531,8 +531,8 @@ static __global__ void mul_mat_vec_q(
|
||||
}
|
||||
|
||||
|
||||
float x_biases[ncols_dst] = { 0.0f };
|
||||
float gate_biases[ncols_dst] = { 0.0f };
|
||||
[[maybe_unused]] float x_biases[ncols_dst] = { 0.0f };
|
||||
[[maybe_unused]] float gate_biases[ncols_dst] = { 0.0f };
|
||||
if constexpr (has_fusion) {
|
||||
const uint32_t channel_bias = ids ? channel_x : channel_dst;
|
||||
if (use_bias) {
|
||||
@@ -589,12 +589,7 @@ static __global__ void mul_mat_vec_q(
|
||||
}
|
||||
|
||||
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size];
|
||||
__shared__ float tmp_shared_gate[(has_fusion && (nwarps-1 > 0)) ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size];
|
||||
if constexpr (!has_fusion) {
|
||||
(void) tmp_shared_gate;
|
||||
} else if (!use_gate) {
|
||||
(void) tmp_shared_gate;
|
||||
}
|
||||
[[maybe_unused]] __shared__ float tmp_shared_gate[(has_fusion && (nwarps-1 > 0)) ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size];
|
||||
|
||||
if (threadIdx.y > 0) {
|
||||
#pragma unroll
|
||||
|
||||
@@ -134,7 +134,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
|
||||
|
||||
// selection_wt is only needed when bias is present (selection uses wt + bias)
|
||||
// when no bias, we use wt directly for both selection and weight values
|
||||
float selection_wt[has_bias ? experts_per_thread : 1];
|
||||
[[maybe_unused]] float selection_wt[has_bias ? experts_per_thread : 1];
|
||||
|
||||
if constexpr (has_bias) {
|
||||
#pragma unroll
|
||||
|
||||
@@ -1927,6 +1927,7 @@ struct ggml_hexagon_opbatch {
|
||||
size_t extra_tens = 0;
|
||||
|
||||
auto fit_tensor = [&](const ggml_tensor *t) {
|
||||
if (!t) return;
|
||||
if (!t_map.count(t)) {
|
||||
extra_tens++;
|
||||
|
||||
@@ -2602,6 +2603,27 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
||||
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
|
||||
return false;
|
||||
}
|
||||
if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) {
|
||||
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n");
|
||||
return false;
|
||||
}
|
||||
if (ggml_nrows(src1) > 1024) {
|
||||
return false; // no huge batches (for now)
|
||||
}
|
||||
break;
|
||||
|
||||
case GGML_TYPE_F32:
|
||||
if (src1->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
if (src0->nb[1] < src0->nb[0]) {
|
||||
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F32 src0 not supported\n");
|
||||
return false;
|
||||
}
|
||||
if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) {
|
||||
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n");
|
||||
return false;
|
||||
}
|
||||
if (ggml_nrows(src1) > 1024) {
|
||||
return false; // no huge batches (for now)
|
||||
}
|
||||
@@ -3142,13 +3164,14 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
||||
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(t)) {
|
||||
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
|
||||
case GGML_UNARY_OP_GELU: return HTP_OP_UNARY_GELU;
|
||||
case GGML_UNARY_OP_SIGMOID: return HTP_OP_UNARY_SIGMOID;
|
||||
case GGML_UNARY_OP_NEG: return HTP_OP_UNARY_NEG;
|
||||
case GGML_UNARY_OP_EXP: return HTP_OP_UNARY_EXP;
|
||||
case GGML_UNARY_OP_SOFTPLUS: return HTP_OP_UNARY_SOFTPLUS;
|
||||
case GGML_UNARY_OP_TANH: return HTP_OP_UNARY_TANH;
|
||||
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
|
||||
case GGML_UNARY_OP_GELU: return HTP_OP_UNARY_GELU;
|
||||
case GGML_UNARY_OP_GELU_QUICK: return HTP_OP_UNARY_GELU;
|
||||
case GGML_UNARY_OP_SIGMOID: return HTP_OP_UNARY_SIGMOID;
|
||||
case GGML_UNARY_OP_NEG: return HTP_OP_UNARY_NEG;
|
||||
case GGML_UNARY_OP_EXP: return HTP_OP_UNARY_EXP;
|
||||
case GGML_UNARY_OP_SOFTPLUS: return HTP_OP_UNARY_SOFTPLUS;
|
||||
case GGML_UNARY_OP_TANH: return HTP_OP_UNARY_TANH;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -3630,6 +3653,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
supp = ggml_hexagon_supported_activations(sess, op);
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -56,7 +56,21 @@ struct htp_opnode {
|
||||
}
|
||||
|
||||
std::vector<const ggml_tensor *> get_inputs() const {
|
||||
std::vector<const ggml_tensor *> inputs;
|
||||
if (fused.empty()) {
|
||||
int last_non_null = -1;
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (node->src[i]) {
|
||||
last_non_null = i;
|
||||
}
|
||||
}
|
||||
std::vector<const ggml_tensor *> inputs(last_non_null + 1, nullptr);
|
||||
for (int i = 0; i <= last_non_null; i++) {
|
||||
inputs[i] = node->src[i];
|
||||
}
|
||||
return inputs;
|
||||
}
|
||||
|
||||
std::vector<const ggml_tensor *> inputs(GGML_MAX_SRC, nullptr);
|
||||
std::vector<const ggml_tensor *> outputs;
|
||||
outputs.push_back(node);
|
||||
for (const auto * f : fused) {
|
||||
@@ -70,20 +84,31 @@ struct htp_opnode {
|
||||
return false;
|
||||
};
|
||||
|
||||
int count = 0;
|
||||
auto add_input = [&](const ggml_tensor * t) {
|
||||
if (t && !contains(outputs, t) && !contains(inputs, t)) {
|
||||
inputs.push_back(t);
|
||||
if (count < (int)inputs.size()) {
|
||||
inputs[count++] = t;
|
||||
} else {
|
||||
inputs.push_back(t);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC && node->src[i]; i++) {
|
||||
add_input(node->src[i]);
|
||||
}
|
||||
for (const auto * f : fused) {
|
||||
for (int i = 0; i < GGML_MAX_SRC && f->src[i]; i++) {
|
||||
add_input(f->src[i]);
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (node->src[i]) {
|
||||
add_input(node->src[i]);
|
||||
}
|
||||
}
|
||||
for (const auto * f : fused) {
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (f->src[i]) {
|
||||
add_input(f->src[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inputs.resize(count);
|
||||
return inputs;
|
||||
}
|
||||
|
||||
@@ -108,6 +133,9 @@ struct htp_opformat {
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
int format_tensor_dims(char * str, const struct ggml_tensor * t) {
|
||||
if (!t) {
|
||||
return sprintf(str, "NONE");
|
||||
}
|
||||
if (t->ne[2] == 1 && t->ne[3] == 1) {
|
||||
return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
|
||||
} else {
|
||||
@@ -136,6 +164,9 @@ struct htp_opformat {
|
||||
}
|
||||
|
||||
int format_tensor_strides(char * str, const struct ggml_tensor * t) {
|
||||
if (!t) {
|
||||
return sprintf(str, "NONE");
|
||||
}
|
||||
const char * c = ggml_is_contiguous(t) ? "" : "!";
|
||||
|
||||
if (t->ne[2] == 1 && t->ne[3] == 1) {
|
||||
@@ -170,11 +201,11 @@ struct htp_opformat {
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += sprintf(p, "%s", ggml_type_name(inputs[0]->type));
|
||||
p += sprintf(p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE");
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", ggml_type_name(inputs[i]->type));
|
||||
p += sprintf(p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE");
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
@@ -184,7 +215,7 @@ struct htp_opformat {
|
||||
}
|
||||
|
||||
const char * tensor_buff_name(const struct ggml_tensor * t) {
|
||||
if (t->buffer) {
|
||||
if (t && t->buffer) {
|
||||
return ggml_backend_buffer_name(t->buffer);
|
||||
}
|
||||
return "NONE";
|
||||
@@ -213,11 +244,11 @@ struct htp_opformat {
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += sprintf(p, "%s", inputs[0]->name);
|
||||
p += sprintf(p, "%s", inputs[0] ? inputs[0]->name : "NONE");
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", inputs[i]->name);
|
||||
p += sprintf(p, "%s", inputs[i] ? inputs[i]->name : "NONE");
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
|
||||
@@ -19,6 +19,43 @@ add_library(${HTP_LIB} SHARED
|
||||
htp_iface_skel.c
|
||||
worker-pool.c
|
||||
hex-dma.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
|
||||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>
|
||||
FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
|
||||
|
||||
if (GGML_HEXAGON_FA_EXP2_HF)
|
||||
message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
|
||||
endif()
|
||||
|
||||
# HMX acceleration: available on v73+ architectures
|
||||
set(HTP_HMX_VERSIONS v73 v75 v79 v81)
|
||||
list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
|
||||
|
||||
if (_hmx_idx GREATER_EQUAL 0)
|
||||
target_sources(${HTP_LIB} PRIVATE
|
||||
hmx-matmul-ops.c
|
||||
hmx-flash-attn-ops.c
|
||||
hmx-queue.c
|
||||
)
|
||||
|
||||
# -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
|
||||
set_source_files_properties(
|
||||
hmx-flash-attn-ops.c
|
||||
hmx-matmul-ops.c
|
||||
hmx-queue.c
|
||||
PROPERTIES COMPILE_OPTIONS "-mhmx"
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE HTP_HAS_HMX=1)
|
||||
endif()
|
||||
|
||||
build_idl(htp_iface.idl ${HTP_LIB})
|
||||
|
||||
target_sources(${HTP_LIB} PRIVATE
|
||||
matmul-ops.c
|
||||
binary-ops.c
|
||||
unary-ops.c
|
||||
@@ -42,40 +79,6 @@ add_library(${HTP_LIB} SHARED
|
||||
pad-ops.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
|
||||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>
|
||||
FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
|
||||
|
||||
if (GGML_HEXAGON_FA_EXP2_HF)
|
||||
message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
|
||||
endif()
|
||||
|
||||
# HMX acceleration: available on v73+ architectures
|
||||
set(HTP_HMX_VERSIONS v73 v75 v79 v81)
|
||||
list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
|
||||
|
||||
if (_hmx_idx GREATER_EQUAL 0)
|
||||
target_sources(${HTP_LIB} PRIVATE
|
||||
hmx-flash-attn-ops.c
|
||||
hmx-matmul-ops.c
|
||||
hmx-queue.c
|
||||
)
|
||||
|
||||
# -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
|
||||
set_source_files_properties(
|
||||
hmx-flash-attn-ops.c
|
||||
hmx-matmul-ops.c
|
||||
hmx-queue.c
|
||||
PROPERTIES COMPILE_OPTIONS "-mhmx"
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE HTP_HAS_HMX=1)
|
||||
endif()
|
||||
|
||||
build_idl(htp_iface.idl ${HTP_LIB})
|
||||
|
||||
set_target_properties(${HTP_LIB} PROPERTIES EXPORT_COMPILE_COMMANDS ON)
|
||||
|
||||
install(TARGETS ${HTP_LIB})
|
||||
|
||||
@@ -276,6 +276,7 @@ int op_argsort(struct htp_ops_context * octx) {
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->src0_spad.size = total_spad_size;
|
||||
octx->src0_spad.size_per_thread = spad_per_thread;
|
||||
octx->src0_spad.src = NULL;
|
||||
|
||||
FARF(HIGH, "argsort: %ux%ux%ux%u -> %ux%ux%ux%u (0x%x, 0x%x)",
|
||||
octx->src[0]->ne[0], octx->src[0]->ne[1], octx->src[0]->ne[2], octx->src[0]->ne[3],
|
||||
|
||||
@@ -262,6 +262,8 @@ int op_concat(struct htp_ops_context * octx) {
|
||||
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
octx->src0_spad.src = NULL;
|
||||
octx->src1_spad.src = NULL;
|
||||
|
||||
if (type_size == 4) {
|
||||
worker_func = concat_2d_f32_transposed;
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "hex-dma.h"
|
||||
#include "hvx-utils.h"
|
||||
#include "hvx-dump.h"
|
||||
#include "hvx-flash-attn.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
@@ -245,6 +246,7 @@ struct htp_fa_context {
|
||||
uint32_t n_head_log2;
|
||||
float m0;
|
||||
float m1;
|
||||
float slopes[512];
|
||||
|
||||
uint32_t n_blocks;
|
||||
|
||||
@@ -412,7 +414,7 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
|
||||
}
|
||||
|
||||
const uint32_t h = iq2; // head index
|
||||
const float slope = (factx->max_bias > 0.0f) ? (h < factx->n_head_log2 ? powf(factx->m0, h + 1) : powf(factx->m1, 2*(h - factx->n_head_log2) + 1)) : 1.0f;
|
||||
const float slope = factx->slopes[h];
|
||||
|
||||
HVX_Vector S_vec = hvx_vec_splat_f32(0.0f);
|
||||
HVX_Vector M_vec = hvx_vec_splat_f32(-INFINITY);
|
||||
@@ -628,8 +630,8 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
}
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
// HMX path: head_dim multiple of 32, F16 KV
|
||||
if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 32 == 0) {
|
||||
// HMX path: head_dim multiple of 64, F16 KV, and no sinks
|
||||
if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 64 == 0 && v->ne[0] % 64 == 0 && octx->src[4] == NULL) {
|
||||
int ret = hmx_flash_attn_ext(octx);
|
||||
if (ret == HTP_STATUS_OK) {
|
||||
return ret;
|
||||
@@ -689,6 +691,13 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
factx.m0 = powf(2.0f, -(max_bias ) / factx.n_head_log2);
|
||||
factx.m1 = powf(2.0f, -(max_bias / 2.0f) / factx.n_head_log2);
|
||||
|
||||
if (n_head > 512) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
for (uint32_t h = 0; h < n_head; ++h) {
|
||||
factx.slopes[h] = (max_bias > 0.0f) ? alibi_slope(h, factx.n_head_log2, factx.m0, factx.m1) : 1.0f;
|
||||
}
|
||||
|
||||
// total rows in q
|
||||
const uint32_t neq0 = q->ne[0];
|
||||
const uint32_t neq1 = q->ne[1];
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#include <string.h>
|
||||
|
||||
#include "hvx-utils.h"
|
||||
#include "hex-fastdiv.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
@@ -14,106 +15,103 @@
|
||||
|
||||
#define HTP_GDN_MAX_SV 128
|
||||
|
||||
|
||||
struct htp_gdn_context {
|
||||
struct htp_ops_context * octx;
|
||||
uint32_t rows_per_thread;
|
||||
size_t state_bytes;
|
||||
bool use_vtcm;
|
||||
uint8_t * vtcm_state_base;
|
||||
size_t vtcm_state_per_thread;
|
||||
size_t state_bytes;
|
||||
uint8_t * vtcm_base;
|
||||
size_t vtcm_per_thread;
|
||||
};
|
||||
|
||||
static inline float gdn_mul_dot_f32(float * restrict dst, const float * restrict mul,
|
||||
const float * restrict dot, uint32_t n) {
|
||||
static inline HVX_Vector gdn_mul_dot_f32(float * restrict dst, const float * restrict mul, const float * restrict dot, uint32_t n) {
|
||||
HVX_Vector acc = Q6_V_vzero();
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
const uint32_t nloe = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vd = hvx_vmemu(dst + i * epv);
|
||||
HVX_Vector vm = hvx_vmem(mul + i * epv);
|
||||
HVX_Vector vd = hvx_vmemu(dst + i * epv);
|
||||
HVX_Vector vm = hvx_vmem(mul + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vm);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vm);
|
||||
hvx_vmemu(dst + i * epv) = out;
|
||||
acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
if (nloe) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vd = hvx_vmemu(dst + off);
|
||||
HVX_Vector vm = hvx_vmem(mul + off);
|
||||
HVX_Vector vd = hvx_vmemu(dst + off);
|
||||
HVX_Vector vm = hvx_vmem(mul + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vm);
|
||||
hvx_vec_store_u(dst + off, tail * sizeof(float), out);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vm);
|
||||
hvx_vec_store_u(dst + off, nloe * sizeof(float), out);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
|
||||
HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
|
||||
acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
|
||||
}
|
||||
|
||||
return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc));
|
||||
return hvx_vec_reduce_sum_f32(acc);
|
||||
}
|
||||
|
||||
static inline float gdn_mul_scalar_dot_f32(float * restrict dst, float mul,
|
||||
const float * restrict dot, uint32_t n) {
|
||||
static inline HVX_Vector gdn_mul_scalar_dot_f32(float * restrict dst, float mul, const float * restrict dot, uint32_t n) {
|
||||
HVX_Vector acc = Q6_V_vzero();
|
||||
const HVX_Vector vmul = hvx_vec_splat_f32(mul);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
const uint32_t nloe = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vd = hvx_vmemu(dst + i * epv);
|
||||
HVX_Vector vd = hvx_vmemu(dst + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vmul);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vmul);
|
||||
hvx_vmemu(dst + i * epv) = out;
|
||||
acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
if (nloe) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vd = hvx_vmemu(dst + off);
|
||||
HVX_Vector vd = hvx_vmemu(dst + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vmul);
|
||||
hvx_vec_store_u(dst + off, tail * sizeof(float), out);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector out = hvx_vec_mul_f32_f32(vd, vmul);
|
||||
hvx_vec_store_u(dst + off, nloe * sizeof(float), out);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
|
||||
HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
|
||||
acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
|
||||
}
|
||||
|
||||
return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc));
|
||||
return hvx_vec_reduce_sum_f32(acc);
|
||||
}
|
||||
|
||||
static inline float gdn_add_scaled_dot_f32(float * restrict dst, const float * restrict src,
|
||||
float scale, const float * restrict dot, uint32_t n) {
|
||||
static inline HVX_Vector gdn_add_scaled_dot_f32(float * restrict dst, const float * restrict src,
|
||||
HVX_Vector vscale, const float * restrict dot, uint32_t n) {
|
||||
HVX_Vector acc = Q6_V_vzero();
|
||||
const HVX_Vector vscale = hvx_vec_splat_f32(scale);
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
const uint32_t nloe = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vd = hvx_vmemu(dst + i * epv);
|
||||
HVX_Vector vs = hvx_vmem(src + i * epv);
|
||||
HVX_Vector vd = hvx_vmemu(dst + i * epv);
|
||||
HVX_Vector vs = hvx_vmem(src + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
HVX_Vector out = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale));
|
||||
HVX_Vector out = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale));
|
||||
hvx_vmemu(dst + i * epv) = out;
|
||||
acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
if (nloe) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vd = hvx_vmemu(dst + off);
|
||||
HVX_Vector vs = hvx_vmem(src + off);
|
||||
HVX_Vector vd = hvx_vmemu(dst + off);
|
||||
HVX_Vector vs = hvx_vmem(src + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_Vector out = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale));
|
||||
hvx_vec_store_u(dst + off, tail * sizeof(float), out);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_Vector out = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale));
|
||||
hvx_vec_store_u(dst + off, nloe * sizeof(float), out);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
|
||||
HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
|
||||
acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
|
||||
}
|
||||
|
||||
return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc));
|
||||
return hvx_vec_reduce_sum_f32(acc);
|
||||
}
|
||||
|
||||
static inline void gdn_mul_dot4_f32(float * restrict dst0, float * restrict dst1,
|
||||
@@ -126,7 +124,7 @@ static inline void gdn_mul_dot4_f32(float * restrict dst0, float * restrict dst1
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
const uint32_t nloe = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vm = hvx_vmem(mul + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
@@ -147,11 +145,11 @@ static inline void gdn_mul_dot4_f32(float * restrict dst0, float * restrict dst1
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
if (nloe) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vm = hvx_vmem(mul + off);
|
||||
HVX_Vector vm = hvx_vmem(mul + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vm);
|
||||
@@ -159,10 +157,10 @@ static inline void gdn_mul_dot4_f32(float * restrict dst0, float * restrict dst1
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vm);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vm);
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst0 + off, nloe * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, nloe * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, nloe * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, nloe * sizeof(float), out3);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
@@ -185,7 +183,7 @@ static inline void gdn_mul_scalar_dot4_f32(float * restrict dst0, float * restri
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
const uint32_t nloe = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
@@ -205,10 +203,10 @@ static inline void gdn_mul_scalar_dot4_f32(float * restrict dst0, float * restri
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
if (nloe) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vmul);
|
||||
@@ -216,10 +214,10 @@ static inline void gdn_mul_scalar_dot4_f32(float * restrict dst0, float * restri
|
||||
HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vmul);
|
||||
HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vmul);
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst0 + off, nloe * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, nloe * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, nloe * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, nloe * sizeof(float), out3);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
@@ -246,7 +244,7 @@ static inline void gdn_add_scaled_dot4_f32(float * restrict dst0, float * restri
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
const uint32_t nloe = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vs = hvx_vmem(src + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
@@ -267,11 +265,11 @@ static inline void gdn_add_scaled_dot4_f32(float * restrict dst0, float * restri
|
||||
acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
if (nloe) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vs = hvx_vmem(src + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + off), hvx_vec_mul_f32_f32(vs, scale0));
|
||||
@@ -279,10 +277,10 @@ static inline void gdn_add_scaled_dot4_f32(float * restrict dst0, float * restri
|
||||
HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + off), hvx_vec_mul_f32_f32(vs, scale2));
|
||||
HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + off), hvx_vec_mul_f32_f32(vs, scale3));
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst0 + off, nloe * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, nloe * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, nloe * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, nloe * sizeof(float), out3);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
@@ -310,7 +308,7 @@ static inline void gdn_mul_dot8_f32(float * restrict dst0, float * restrict dst1
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
const uint32_t nloe = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vm = hvx_vmem(mul + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
@@ -343,11 +341,11 @@ static inline void gdn_mul_dot8_f32(float * restrict dst0, float * restrict dst1
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
if (nloe) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vm = hvx_vmem(mul + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vm);
|
||||
@@ -359,14 +357,14 @@ static inline void gdn_mul_dot8_f32(float * restrict dst0, float * restrict dst1
|
||||
HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + off), vm);
|
||||
HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + off), vm);
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4);
|
||||
hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5);
|
||||
hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6);
|
||||
hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7);
|
||||
hvx_vec_store_u(dst0 + off, nloe * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, nloe * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, nloe * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, nloe * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst4 + off, nloe * sizeof(float), out4);
|
||||
hvx_vec_store_u(dst5 + off, nloe * sizeof(float), out5);
|
||||
hvx_vec_store_u(dst6 + off, nloe * sizeof(float), out6);
|
||||
hvx_vec_store_u(dst7 + off, nloe * sizeof(float), out7);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
@@ -400,7 +398,7 @@ static inline void gdn_mul_scalar_dot8_f32(float * restrict dst0, float * restri
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
const uint32_t nloe = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
|
||||
@@ -432,10 +430,10 @@ static inline void gdn_mul_scalar_dot8_f32(float * restrict dst0, float * restri
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
if (nloe) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vmul);
|
||||
@@ -447,14 +445,14 @@ static inline void gdn_mul_scalar_dot8_f32(float * restrict dst0, float * restri
|
||||
HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + off), vmul);
|
||||
HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + off), vmul);
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4);
|
||||
hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5);
|
||||
hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6);
|
||||
hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7);
|
||||
hvx_vec_store_u(dst0 + off, nloe * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, nloe * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, nloe * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, nloe * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst4 + off, nloe * sizeof(float), out4);
|
||||
hvx_vec_store_u(dst5 + off, nloe * sizeof(float), out5);
|
||||
hvx_vec_store_u(dst6 + off, nloe * sizeof(float), out6);
|
||||
hvx_vec_store_u(dst7 + off, nloe * sizeof(float), out7);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
@@ -496,7 +494,7 @@ static inline void gdn_add_scaled_dot8_f32(float * restrict dst0, float * restri
|
||||
|
||||
const uint32_t epv = 128 / sizeof(float);
|
||||
const uint32_t nvec = n / epv;
|
||||
const uint32_t tail = n % epv;
|
||||
const uint32_t nloe = n % epv;
|
||||
for (uint32_t i = 0; i < nvec; ++i) {
|
||||
HVX_Vector vs = hvx_vmem(src + i * epv);
|
||||
HVX_Vector vdot = hvx_vmem(dot + i * epv);
|
||||
@@ -529,11 +527,11 @@ static inline void gdn_add_scaled_dot8_f32(float * restrict dst0, float * restri
|
||||
acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
|
||||
}
|
||||
|
||||
if (tail) {
|
||||
if (nloe) {
|
||||
const uint32_t off = nvec * epv;
|
||||
HVX_Vector vs = hvx_vmem(src + off);
|
||||
HVX_Vector vdot = hvx_vmem(dot + off);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + off), hvx_vec_mul_f32_f32(vs, scale0));
|
||||
@@ -545,14 +543,14 @@ static inline void gdn_add_scaled_dot8_f32(float * restrict dst0, float * restri
|
||||
HVX_Vector out6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + off), hvx_vec_mul_f32_f32(vs, scale6));
|
||||
HVX_Vector out7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + off), hvx_vec_mul_f32_f32(vs, scale7));
|
||||
|
||||
hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4);
|
||||
hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5);
|
||||
hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6);
|
||||
hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7);
|
||||
hvx_vec_store_u(dst0 + off, nloe * sizeof(float), out0);
|
||||
hvx_vec_store_u(dst1 + off, nloe * sizeof(float), out1);
|
||||
hvx_vec_store_u(dst2 + off, nloe * sizeof(float), out2);
|
||||
hvx_vec_store_u(dst3 + off, nloe * sizeof(float), out3);
|
||||
hvx_vec_store_u(dst4 + off, nloe * sizeof(float), out4);
|
||||
hvx_vec_store_u(dst5 + off, nloe * sizeof(float), out5);
|
||||
hvx_vec_store_u(dst6 + off, nloe * sizeof(float), out6);
|
||||
hvx_vec_store_u(dst7 + off, nloe * sizeof(float), out7);
|
||||
|
||||
acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
|
||||
acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
|
||||
@@ -605,26 +603,65 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
|
||||
float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_sums[4] __attribute__((aligned(128)));
|
||||
float local_sums[32] __attribute__((aligned(128)));
|
||||
|
||||
dma_queue * dma = octx->ctx->dma[ith];
|
||||
size_t state_aligned = (size_t) S_v * S_v * sizeof(float);
|
||||
state_aligned = (state_aligned + 127) & ~(size_t)127;
|
||||
float * s_work[2];
|
||||
s_work[0] = (float *) (gctx->vtcm_base + gctx->vtcm_per_thread * ith);
|
||||
s_work[1] = s_work[0] + state_aligned / sizeof(float);
|
||||
|
||||
struct fastdiv_values fd_H = init_fastdiv_values(H);
|
||||
struct fastdiv_values fd_q1 = init_fastdiv_values(q->ne[1]);
|
||||
struct fastdiv_values fd_k1 = init_fastdiv_values(k->ne[1]);
|
||||
struct fastdiv_values fd_rq3 = init_fastdiv_values(rq3);
|
||||
struct fastdiv_values fd_rk3 = init_fastdiv_values(rk3);
|
||||
|
||||
const uint64_t state_seq_stride = state->nb[2] / sizeof(float);
|
||||
const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs;
|
||||
const int64_t shift = (int64_t) n_tokens - (int64_t) K;
|
||||
|
||||
for (uint32_t ir = ith; ir < total_rows; ir += nth) {
|
||||
const uint32_t iv1 = ir % H;
|
||||
const uint32_t iv3 = ir / H;
|
||||
uint32_t ir_prefetch = ith;
|
||||
int spad_idx = 0;
|
||||
|
||||
const uint32_t iq1 = iv1 % q->ne[1];
|
||||
const uint32_t ik1 = iv1 % k->ne[1];
|
||||
const uint32_t iq3 = iv3 / rq3;
|
||||
const uint32_t ik3 = iv3 / rk3;
|
||||
// Prefetch preamble (up to 2 steps)
|
||||
for (int k = 0; k < 2 && ir_prefetch < total_rows; k++) {
|
||||
const uint32_t piv1 = fastmodulo(ir_prefetch, H, &fd_H);
|
||||
const uint32_t piv3 = fastdiv(ir_prefetch, &fd_H);
|
||||
const float * ps_in = state_in_base + (uint64_t) piv3 * state_seq_stride + (uint64_t) piv1 * S_v * S_v;
|
||||
float * ps_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) piv3 * H + piv1) * S_v * S_v;
|
||||
|
||||
// Push dummy write-back
|
||||
dma_queue_push(dma, dma_make_ptr(ps_out, s_work[spad_idx]),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), 0);
|
||||
|
||||
// Push fetch
|
||||
dma_queue_push(dma, dma_make_ptr(s_work[spad_idx], ps_in),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), S_v);
|
||||
|
||||
ir_prefetch += nth;
|
||||
spad_idx ^= 1;
|
||||
}
|
||||
|
||||
int curr_spad_idx = 0;
|
||||
for (uint32_t ir = ith; ir < total_rows; ir += nth) {
|
||||
dma_queue_pop(dma);
|
||||
dma_queue_pop(dma);
|
||||
|
||||
float * s_work_curr = s_work[curr_spad_idx];
|
||||
|
||||
const uint32_t iv1 = fastmodulo(ir, H, &fd_H);
|
||||
const uint32_t iv3 = fastdiv(ir, &fd_H);
|
||||
|
||||
const uint32_t iq1 = fastmodulo(iv1, q->ne[1], &fd_q1);
|
||||
const uint32_t ik1 = fastmodulo(iv1, k->ne[1], &fd_k1);
|
||||
const uint32_t iq3 = fastdiv(iv3, &fd_rq3);
|
||||
const uint32_t ik3 = fastdiv(iv3, &fd_rk3);
|
||||
|
||||
float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
const float * s_in = state_in_base + (uint64_t) iv3 * state_seq_stride + (uint64_t) iv1 * S_v * S_v;
|
||||
|
||||
memcpy(s_out, s_in, gctx->state_bytes);
|
||||
float * s_work = s_out;
|
||||
|
||||
float * attn_data = dst_base + ((uint64_t) iv3 * n_tokens * H + iv1) * S_v;
|
||||
|
||||
@@ -640,57 +677,117 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
|
||||
const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data +
|
||||
(uint64_t) iv3 * beta->nb[3] + (uint64_t) t * beta->nb[2] + (uint64_t) iv1 * beta->nb[1]);
|
||||
|
||||
memcpy(local_q, q_t, (size_t) S_v * sizeof(float));
|
||||
memcpy(local_k, k_t, (size_t) S_v * sizeof(float));
|
||||
hvx_copy_f32_au((uint8_t *) local_q, (const uint8_t *) q_t, S_v);
|
||||
hvx_copy_f32_au((uint8_t *) local_k, (const uint8_t *) k_t, S_v);
|
||||
|
||||
if (kda) {
|
||||
hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false);
|
||||
|
||||
uint32_t j = 0;
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[4] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
for (; j + 8 <= S_v; j += 8) {
|
||||
float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
|
||||
float * row4 = s_work_curr + (uint64_t) (j + 4) * S_v;
|
||||
float * row5 = s_work_curr + (uint64_t) (j + 5) * S_v;
|
||||
float * row6 = s_work_curr + (uint64_t) (j + 6) * S_v;
|
||||
float * row7 = s_work_curr + (uint64_t) (j + 7) * S_v;
|
||||
gdn_mul_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
local_gate, local_k, S_v, local_sums);
|
||||
|
||||
float local_delta_b[32] __attribute__((aligned(128)));
|
||||
HVX_Vector vv_t = hvx_vmemu(v_t + j);
|
||||
HVX_Vector v_local_sums = hvx_vmem(local_sums);
|
||||
HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
|
||||
hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));
|
||||
|
||||
gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
|
||||
HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
|
||||
hvx_vec_store_u(attn_data + j, 8 * sizeof(float), res_attn);
|
||||
}
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);
|
||||
|
||||
float local_delta_b[32] __attribute__((aligned(128)));
|
||||
HVX_Vector vv_t = hvx_vmemu(v_t + j);
|
||||
HVX_Vector v_local_sums = hvx_vmem(local_sums);
|
||||
HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
|
||||
hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));
|
||||
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
|
||||
HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
|
||||
hvx_vec_store_u(attn_data + j, 4 * sizeof(float), res_attn);
|
||||
}
|
||||
HVX_Vector vscale_splat = hvx_vec_splat_f32(scale);
|
||||
for (; j < S_v; ++j) {
|
||||
float * row = s_work + (uint64_t) j * S_v;
|
||||
const float sum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
|
||||
const float dj = (v_t[j] - sum) * beta_val;
|
||||
attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
|
||||
float * row = s_work_curr + (uint64_t) j * S_v;
|
||||
HVX_Vector vsum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
|
||||
HVX_Vector vv_t = hvx_vec_splat_f32(v_t[j]);
|
||||
HVX_Vector vdj = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(vv_t, vsum), hvx_vec_splat_f32(beta_val));
|
||||
HVX_Vector vres = gdn_add_scaled_dot_f32(row, local_k, vdj, local_q, S_v);
|
||||
attn_data[j] = hvx_vec_get_f32(hvx_vec_mul_f32_f32(vres, vscale_splat));
|
||||
}
|
||||
} else {
|
||||
const float gate = expf(g_t[0]);
|
||||
uint32_t j = 0;
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[4] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
for (; j + 8 <= S_v; j += 8) {
|
||||
float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
|
||||
float * row4 = s_work_curr + (uint64_t) (j + 4) * S_v;
|
||||
float * row5 = s_work_curr + (uint64_t) (j + 5) * S_v;
|
||||
float * row6 = s_work_curr + (uint64_t) (j + 6) * S_v;
|
||||
float * row7 = s_work_curr + (uint64_t) (j + 7) * S_v;
|
||||
gdn_mul_scalar_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
gate, local_k, S_v, local_sums);
|
||||
|
||||
float local_delta_b[32] __attribute__((aligned(128)));
|
||||
HVX_Vector vv_t = hvx_vmemu(v_t + j);
|
||||
HVX_Vector v_local_sums = hvx_vmem(local_sums);
|
||||
HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
|
||||
hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));
|
||||
|
||||
gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
|
||||
HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
|
||||
hvx_vec_store_u(attn_data + j, 8 * sizeof(float), res_attn);
|
||||
}
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);
|
||||
|
||||
float local_delta_b[32] __attribute__((aligned(128)));
|
||||
HVX_Vector vv_t = hvx_vmemu(v_t + j);
|
||||
HVX_Vector v_local_sums = hvx_vmem(local_sums);
|
||||
HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
|
||||
hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));
|
||||
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
|
||||
HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
|
||||
hvx_vec_store_u(attn_data + j, 4 * sizeof(float), res_attn);
|
||||
}
|
||||
HVX_Vector vscale_splat = hvx_vec_splat_f32(scale);
|
||||
for (; j < S_v; ++j) {
|
||||
float * row = s_work + (uint64_t) j * S_v;
|
||||
const float sum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
|
||||
const float dj = (v_t[j] - sum) * beta_val;
|
||||
attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
|
||||
float * row = s_work_curr + (uint64_t) j * S_v;
|
||||
HVX_Vector vsum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
|
||||
HVX_Vector vv_t = hvx_vec_splat_f32(v_t[j]);
|
||||
HVX_Vector vdj = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(vv_t, vsum), hvx_vec_splat_f32(beta_val));
|
||||
HVX_Vector vres = gdn_add_scaled_dot_f32(row, local_k, vdj, local_q, S_v);
|
||||
attn_data[j] = hvx_vec_get_f32(hvx_vec_mul_f32_f32(vres, vscale_splat));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -698,17 +795,40 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
|
||||
const int64_t target_slot = (int64_t) t - shift;
|
||||
if (target_slot >= 0 && target_slot < (int64_t) K) {
|
||||
float * curr_state_o = state_out_base + (uint64_t) target_slot * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
if (curr_state_o != s_work) {
|
||||
memcpy(curr_state_o, s_work, gctx->state_bytes);
|
||||
if (curr_state_o != s_out) {
|
||||
hvx_copy_f32_uu((uint8_t *) curr_state_o, (const uint8_t *) s_work_curr, S_v * S_v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
attn_data += (uint64_t) S_v * H;
|
||||
}
|
||||
|
||||
// Push real write-back
|
||||
dma_queue_push(dma, dma_make_ptr(s_out, s_work_curr),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), S_v);
|
||||
|
||||
// Prefetch next block (if any)
|
||||
if (ir_prefetch < total_rows) {
|
||||
const uint32_t piv1 = fastmodulo(ir_prefetch, H, &fd_H);
|
||||
const uint32_t piv3 = fastdiv(ir_prefetch, &fd_H);
|
||||
const float * ps_in = state_in_base + (uint64_t) piv3 * state_seq_stride + (uint64_t) piv1 * S_v * S_v;
|
||||
|
||||
dma_queue_push(dma, dma_make_ptr(s_work[spad_idx], ps_in),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), S_v);
|
||||
|
||||
ir_prefetch += nth;
|
||||
spad_idx ^= 1;
|
||||
}
|
||||
|
||||
curr_spad_idx ^= 1;
|
||||
}
|
||||
dma_queue_flush(dma);
|
||||
}
|
||||
|
||||
|
||||
static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_gdn_context * gctx = (struct htp_gdn_context *) data;
|
||||
struct htp_ops_context * octx = gctx->octx;
|
||||
@@ -743,41 +863,64 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
|
||||
float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_sums[8] __attribute__((aligned(128)));
|
||||
float local_sums[32] __attribute__((aligned(128)));
|
||||
|
||||
dma_queue * dma = octx->ctx->dma[ith];
|
||||
size_t state_aligned = (size_t) S_v * S_v * sizeof(float);
|
||||
state_aligned = (state_aligned + 127) & ~(size_t)127;
|
||||
float * s_work[2];
|
||||
s_work[0] = (float *) (gctx->vtcm_base + gctx->vtcm_per_thread * ith);
|
||||
s_work[1] = s_work[0] + state_aligned / sizeof(float);
|
||||
|
||||
uint8_t * spad = NULL;
|
||||
if (gctx->use_vtcm) {
|
||||
spad = gctx->vtcm_state_base + gctx->vtcm_state_per_thread * ith;
|
||||
}
|
||||
struct fastdiv_values fd_H = init_fastdiv_values(H);
|
||||
struct fastdiv_values fd_q1 = init_fastdiv_values(q->ne[1]);
|
||||
struct fastdiv_values fd_k1 = init_fastdiv_values(k->ne[1]);
|
||||
struct fastdiv_values fd_rq3 = init_fastdiv_values(rq3);
|
||||
struct fastdiv_values fd_rk3 = init_fastdiv_values(rk3);
|
||||
|
||||
const uint64_t state_seq_stride = state->nb[2] / sizeof(float);
|
||||
const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs;
|
||||
|
||||
for (uint32_t ir = ith; ir < total_rows; ir += nth) {
|
||||
const uint32_t iv1 = ir % H;
|
||||
const uint32_t iv3 = ir / H;
|
||||
uint32_t ir_prefetch = ith;
|
||||
int spad_idx = 0;
|
||||
|
||||
const uint32_t iq1 = iv1 % q->ne[1];
|
||||
const uint32_t ik1 = iv1 % k->ne[1];
|
||||
const uint32_t iq3 = iv3 / rq3;
|
||||
const uint32_t ik3 = iv3 / rk3;
|
||||
// Prefetch preamble (up to 2 steps)
|
||||
for (int k = 0; k < 2 && ir_prefetch < total_rows; k++) {
|
||||
const uint32_t piv1 = fastmodulo(ir_prefetch, H, &fd_H);
|
||||
const uint32_t piv3 = fastdiv(ir_prefetch, &fd_H);
|
||||
const float * ps_in = state_in_base + (uint64_t) piv3 * state_seq_stride + (uint64_t) piv1 * S_v * S_v;
|
||||
float * ps_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) piv3 * H + piv1) * S_v * S_v;
|
||||
|
||||
// Push dummy write-back
|
||||
dma_queue_push(dma, dma_make_ptr(ps_out, s_work[spad_idx]),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), 0);
|
||||
|
||||
// Push fetch
|
||||
dma_queue_push(dma, dma_make_ptr(s_work[spad_idx], ps_in),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), S_v);
|
||||
|
||||
ir_prefetch += nth;
|
||||
spad_idx ^= 1;
|
||||
}
|
||||
|
||||
int curr_spad_idx = 0;
|
||||
for (uint32_t ir = ith; ir < total_rows; ir += nth) {
|
||||
dma_queue_pop(dma);
|
||||
dma_queue_pop(dma);
|
||||
|
||||
float * s_work_curr = s_work[curr_spad_idx];
|
||||
|
||||
const uint32_t iv1 = fastmodulo(ir, H, &fd_H);
|
||||
const uint32_t iv3 = fastdiv(ir, &fd_H);
|
||||
|
||||
const uint32_t iq1 = fastmodulo(iv1, q->ne[1], &fd_q1);
|
||||
const uint32_t ik1 = fastmodulo(iv1, k->ne[1], &fd_k1);
|
||||
const uint32_t iq3 = fastdiv(iv3, &fd_rq3);
|
||||
const uint32_t ik3 = fastdiv(iv3, &fd_rk3);
|
||||
|
||||
float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
const float * s_in = state_in_base + (uint64_t) iv3 * state_seq_stride + (uint64_t) iv1 * S_v * S_v;
|
||||
float * s_work;
|
||||
|
||||
if (spad) {
|
||||
dma_queue_push(dma, dma_make_ptr(spad, s_in),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), S_v);
|
||||
dma_queue_pop(dma);
|
||||
s_work = (float *) spad;
|
||||
} else {
|
||||
s_work = s_out;
|
||||
memcpy(s_work, s_in, gctx->state_bytes);
|
||||
}
|
||||
|
||||
float * attn_data = dst_base + ((uint64_t) iv3 * H + iv1) * S_v;
|
||||
|
||||
@@ -792,111 +935,145 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
|
||||
const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data +
|
||||
(uint64_t) iv3 * beta->nb[3] + (uint64_t) iv1 * beta->nb[1]);
|
||||
|
||||
memcpy(local_q, q_t, (size_t) S_v * sizeof(float));
|
||||
memcpy(local_k, k_t, (size_t) S_v * sizeof(float));
|
||||
hvx_copy_f32_au((uint8_t *) local_q, (const uint8_t *) q_t, S_v);
|
||||
hvx_copy_f32_au((uint8_t *) local_k, (const uint8_t *) k_t, S_v);
|
||||
|
||||
if (kda) {
|
||||
hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false);
|
||||
|
||||
uint32_t j = 0;
|
||||
for (; j + 8 <= S_v; j += 8) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
float * row4 = s_work + (uint64_t) (j + 4) * S_v;
|
||||
float * row5 = s_work + (uint64_t) (j + 5) * S_v;
|
||||
float * row6 = s_work + (uint64_t) (j + 6) * S_v;
|
||||
float * row7 = s_work + (uint64_t) (j + 7) * S_v;
|
||||
float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
|
||||
float * row4 = s_work_curr + (uint64_t) (j + 4) * S_v;
|
||||
float * row5 = s_work_curr + (uint64_t) (j + 5) * S_v;
|
||||
float * row6 = s_work_curr + (uint64_t) (j + 6) * S_v;
|
||||
float * row7 = s_work_curr + (uint64_t) (j + 7) * S_v;
|
||||
gdn_mul_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
local_gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[8] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 8; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
|
||||
float local_delta_b[32] __attribute__((aligned(128)));
|
||||
HVX_Vector vv_t = hvx_vmemu(v_t + j);
|
||||
HVX_Vector v_local_sums = hvx_vmem(local_sums);
|
||||
HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
|
||||
hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));
|
||||
|
||||
gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 8; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
|
||||
HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
|
||||
hvx_vec_store_u(attn_data + j, 8 * sizeof(float), res_attn);
|
||||
}
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[4] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
|
||||
float local_delta_b[32] __attribute__((aligned(128)));
|
||||
HVX_Vector vv_t = hvx_vmemu(v_t + j);
|
||||
HVX_Vector v_local_sums = hvx_vmem(local_sums);
|
||||
HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
|
||||
hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));
|
||||
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
|
||||
HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
|
||||
hvx_vec_store_u(attn_data + j, 4 * sizeof(float), res_attn);
|
||||
}
|
||||
HVX_Vector vscale_splat = hvx_vec_splat_f32(scale);
|
||||
for (; j < S_v; ++j) {
|
||||
float * row = s_work + (uint64_t) j * S_v;
|
||||
const float sum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
|
||||
const float dj = (v_t[j] - sum) * beta_val;
|
||||
attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
|
||||
float * row = s_work_curr + (uint64_t) j * S_v;
|
||||
HVX_Vector vsum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
|
||||
HVX_Vector vv_t = hvx_vec_splat_f32(v_t[j]);
|
||||
HVX_Vector vdj = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(vv_t, vsum), hvx_vec_splat_f32(beta_val));
|
||||
HVX_Vector vres = gdn_add_scaled_dot_f32(row, local_k, vdj, local_q, S_v);
|
||||
attn_data[j] = hvx_vec_get_f32(hvx_vec_mul_f32_f32(vres, vscale_splat));
|
||||
}
|
||||
} else {
|
||||
const float gate = expf(g_t[0]);
|
||||
uint32_t j = 0;
|
||||
for (; j + 8 <= S_v; j += 8) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
float * row4 = s_work + (uint64_t) (j + 4) * S_v;
|
||||
float * row5 = s_work + (uint64_t) (j + 5) * S_v;
|
||||
float * row6 = s_work + (uint64_t) (j + 6) * S_v;
|
||||
float * row7 = s_work + (uint64_t) (j + 7) * S_v;
|
||||
float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
|
||||
float * row4 = s_work_curr + (uint64_t) (j + 4) * S_v;
|
||||
float * row5 = s_work_curr + (uint64_t) (j + 5) * S_v;
|
||||
float * row6 = s_work_curr + (uint64_t) (j + 6) * S_v;
|
||||
float * row7 = s_work_curr + (uint64_t) (j + 7) * S_v;
|
||||
gdn_mul_scalar_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[8] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 8; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
|
||||
float local_delta_b[32] __attribute__((aligned(128)));
|
||||
HVX_Vector vv_t = hvx_vmemu(v_t + j);
|
||||
HVX_Vector v_local_sums = hvx_vmem(local_sums);
|
||||
HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
|
||||
hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));
|
||||
|
||||
gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
|
||||
local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 8; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
|
||||
HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
|
||||
hvx_vec_store_u(attn_data + j, 8 * sizeof(float), res_attn);
|
||||
}
|
||||
for (; j + 4 <= S_v; j += 4) {
|
||||
float * row0 = s_work + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work + (uint64_t) (j + 3) * S_v;
|
||||
float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
|
||||
float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
|
||||
float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
|
||||
float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
|
||||
gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);
|
||||
float local_delta_b[4] __attribute__((aligned(128)));
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
|
||||
}
|
||||
|
||||
float local_delta_b[32] __attribute__((aligned(128)));
|
||||
HVX_Vector vv_t = hvx_vmemu(v_t + j);
|
||||
HVX_Vector v_local_sums = hvx_vmem(local_sums);
|
||||
HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
|
||||
hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));
|
||||
|
||||
gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
|
||||
for (uint32_t r = 0; r < 4; ++r) {
|
||||
attn_data[j + r] = local_sums[r] * scale;
|
||||
}
|
||||
|
||||
HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
|
||||
hvx_vec_store_u(attn_data + j, 4 * sizeof(float), res_attn);
|
||||
}
|
||||
HVX_Vector vscale_splat = hvx_vec_splat_f32(scale);
|
||||
for (; j < S_v; ++j) {
|
||||
float * row = s_work + (uint64_t) j * S_v;
|
||||
const float sum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
|
||||
const float dj = (v_t[j] - sum) * beta_val;
|
||||
attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
|
||||
float * row = s_work_curr + (uint64_t) j * S_v;
|
||||
HVX_Vector vsum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
|
||||
HVX_Vector vv_t = hvx_vec_splat_f32(v_t[j]);
|
||||
HVX_Vector vdj = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(vv_t, vsum), hvx_vec_splat_f32(beta_val));
|
||||
HVX_Vector vres = gdn_add_scaled_dot_f32(row, local_k, vdj, local_q, S_v);
|
||||
attn_data[j] = hvx_vec_get_f32(hvx_vec_mul_f32_f32(vres, vscale_splat));
|
||||
}
|
||||
}
|
||||
|
||||
if (spad) {
|
||||
dma_queue_push(dma, dma_make_ptr(s_out, spad),
|
||||
// Push real write-back
|
||||
dma_queue_push(dma, dma_make_ptr(s_out, s_work_curr),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), S_v);
|
||||
|
||||
// Prefetch next block (if any)
|
||||
if (ir_prefetch < total_rows) {
|
||||
const uint32_t piv1 = fastmodulo(ir_prefetch, H, &fd_H);
|
||||
const uint32_t piv3 = fastdiv(ir_prefetch, &fd_H);
|
||||
const float * ps_in = state_in_base + (uint64_t) piv3 * state_seq_stride + (uint64_t) piv1 * S_v * S_v;
|
||||
|
||||
dma_queue_push(dma, dma_make_ptr(s_work[spad_idx], ps_in),
|
||||
S_v * sizeof(float), S_v * sizeof(float),
|
||||
S_v * sizeof(float), S_v);
|
||||
dma_queue_pop(dma);
|
||||
|
||||
ir_prefetch += nth;
|
||||
spad_idx ^= 1;
|
||||
}
|
||||
|
||||
curr_spad_idx ^= 1;
|
||||
}
|
||||
dma_queue_flush(dma);
|
||||
}
|
||||
|
||||
|
||||
int op_gated_delta_net(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * q = octx->src[0];
|
||||
const struct htp_tensor * k = octx->src[1];
|
||||
@@ -952,18 +1129,11 @@ int op_gated_delta_net(struct htp_ops_context * octx) {
|
||||
size_t state_aligned = (size_t) S_v * S_v * sizeof(float);
|
||||
state_aligned = (state_aligned + 127) & ~(size_t)127;
|
||||
|
||||
gctx.use_vtcm = false;
|
||||
gctx.vtcm_state_base = NULL;
|
||||
gctx.vtcm_state_per_thread = 0;
|
||||
assert(octx->ctx->vtcm_base != NULL);
|
||||
assert(octx->ctx->vtcm_size >= 2 * state_aligned * octx->n_threads);
|
||||
|
||||
if (n_tokens == 1 && octx->ctx->vtcm_base) {
|
||||
size_t vtcm_total = state_aligned * octx->n_threads;
|
||||
if (octx->ctx->vtcm_size >= vtcm_total) {
|
||||
gctx.use_vtcm = true;
|
||||
gctx.vtcm_state_base = octx->ctx->vtcm_base;
|
||||
gctx.vtcm_state_per_thread = state_aligned;
|
||||
}
|
||||
}
|
||||
gctx.vtcm_base = octx->ctx->vtcm_base;
|
||||
gctx.vtcm_per_thread = 2 * state_aligned;
|
||||
|
||||
if (n_tokens == 1) {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_tg_thread, &gctx, octx->n_threads);
|
||||
|
||||
@@ -17,14 +17,17 @@
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "hex-dma.h"
|
||||
#include "hex-fastdiv.h"
|
||||
#include "hmx-profile.h"
|
||||
#include "hmx-queue.h"
|
||||
#include "hmx-utils.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
#include "hvx-dump.h"
|
||||
#include "hvx-copy.h"
|
||||
#include "hvx-reduce.h"
|
||||
#include "hvx-utils.h"
|
||||
#include "hvx-flash-attn.h"
|
||||
#include "vtcm-utils.h"
|
||||
#include "worker-pool.h"
|
||||
|
||||
@@ -46,7 +49,7 @@
|
||||
// g_br = hex_align_up(gqa_factor * Br, 32) replaces Br for all Q/O/S/P/D dimensions.
|
||||
// Layout: Q + O_ping + O_pong + K_dma*2 + V_dma*2 + K_tile + V_tile + S + P + D + vectors + scales
|
||||
// Mask is DMA'd into a VTCM buffer (Br rows per KV block) to avoid DDR reads in softmax.
|
||||
static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads) {
|
||||
static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool use_pipeline) {
|
||||
const size_t g_br = hex_align_up(gqa_factor * Br, HMX_FP16_TILE_N_ROWS);
|
||||
const size_t q_tile_size = hex_align_up(g_br * DK * sizeof(__fp16), 4096); // Q: [g_br, DK]
|
||||
const size_t o_tile_size = hex_align_up(g_br * DV * sizeof(__fp16), 4096); // O: [g_br, DV] x2 ping-pong
|
||||
@@ -67,7 +70,7 @@ static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV,
|
||||
+ k_dma_size * 2 // K DMA x2
|
||||
+ v_dma_size * 2 // V DMA x2
|
||||
+ k_tile_size * 1 // K tiles
|
||||
+ v_tile_size * 1 // V tiles
|
||||
+ v_tile_size * (use_pipeline ? 2 : 1) // V tiles (double-buffered if pipelining)
|
||||
+ s_tile_size * 2 // S + P
|
||||
+ d_tile_size * 1 // D (diagonal matrix)
|
||||
+ col_vec_size * 4 // m_vec, l_vec, s_rowmax, p_rowsum
|
||||
@@ -144,12 +147,13 @@ static int hmx_fa_find_chunk_size(size_t * Br_out,
|
||||
// See .cursor/todos/hmx-flash-attn-bc-search-space.md for the perf trade-off.
|
||||
const size_t bc_unit = HMX_FP16_TILE_N_COLS * 2; // 64
|
||||
const size_t fp16 = sizeof(__fp16);
|
||||
const bool can_pipeline = (kv_len >= FA_MIN_KV_BLOCKS * bc_unit && n_threads >= 2);
|
||||
|
||||
// Approximate per-unit VTCM costs (without per-buffer alignment padding).
|
||||
const size_t per_gbr = (DK + 2 * DV) * fp16 + 4 * fp16; // Q + O×2 + 4 col vectors
|
||||
const size_t per_gbr2 = fp16; // D diagonal matrix
|
||||
const size_t per_bc =
|
||||
3 * (DK + DV) * fp16 + 2 * n_threads * fp16; // K_dma×2 + V_dma×2 + K_tile + V_tile + row bufs
|
||||
3 * DK * fp16 + (can_pipeline ? 4 : 3) * DV * fp16 + 2 * n_threads * fp16; // K/V DMA x2 + tiles + row bufs
|
||||
const size_t per_gbr_bc = 2 * fp16; // S + P
|
||||
|
||||
const size_t overhead = 256 * 2 + 13 * 4096;
|
||||
@@ -164,7 +168,6 @@ static int hmx_fa_find_chunk_size(size_t * Br_out,
|
||||
|
||||
// Pipeline constraint: cap Bc so n_kv_blocks >= FA_MIN_KV_BLOCKS.
|
||||
// Only relax when kv_len is too short to form enough blocks.
|
||||
const bool can_pipeline = (kv_len >= FA_MIN_KV_BLOCKS * bc_unit && n_threads >= 2);
|
||||
const size_t Bc_limit = can_pipeline ? hex_align_down(kv_len / FA_MIN_KV_BLOCKS, bc_unit) :
|
||||
(kv_len >= bc_unit ? hex_align_down(kv_len, bc_unit) : bc_unit);
|
||||
// Cost coefficients calibrated from profiling
|
||||
@@ -200,7 +203,7 @@ static int hmx_fa_find_chunk_size(size_t * Br_out,
|
||||
}
|
||||
|
||||
// Exact VTCM verification (alignment padding may push over budget)
|
||||
while (Bc >= bc_unit && hmx_fa_compute_vtcm_usage(gqa_factor, DK, DV, Br, Bc, n_threads) > vtcm_budget) {
|
||||
while (Bc >= bc_unit && hmx_fa_compute_vtcm_usage(gqa_factor, DK, DV, Br, Bc, n_threads, can_pipeline) > vtcm_budget) {
|
||||
Bc -= bc_unit;
|
||||
}
|
||||
if (Bc < bc_unit) {
|
||||
@@ -303,6 +306,7 @@ struct hmx_fa_context {
|
||||
uint32_t n_kv_heads; // number of KV heads
|
||||
uint32_t n_heads; // number of Q heads
|
||||
uint32_t G; // GQA factor = n_heads / n_kv_heads
|
||||
struct fastdiv_values div_G;
|
||||
uint32_t n_kv_blocks;
|
||||
uint32_t neq1; // Q token count
|
||||
|
||||
@@ -321,7 +325,7 @@ struct hmx_fa_context {
|
||||
__fp16 * vtcm_k_fp16[2]; // K DMA double-buffer [Bc, D]
|
||||
__fp16 * vtcm_v_fp16[2]; // V DMA double-buffer [Bc, D]
|
||||
__fp16 * vtcm_k_tiles; // K tiles (transposed)
|
||||
__fp16 * vtcm_v_tiles; // V tiles (column-major)
|
||||
__fp16 * vtcm_v_tiles[2]; // V tiles (column-major, double-buffered)
|
||||
__fp16 * vtcm_s_tiles; // S = QK^T [g_br, Bc]
|
||||
__fp16 * vtcm_p_tiles; // P = softmax(S) [g_br, Bc]
|
||||
__fp16 * vtcm_d_tiles; // Diagonal rescale [g_br, g_br]
|
||||
@@ -402,7 +406,9 @@ static void fa_v_interleave_thread(unsigned int n, unsigned int i, void * data)
|
||||
return;
|
||||
}
|
||||
|
||||
hmx_interleave_cols_to_tiles(factx->vtcm_v_tiles, factx->vtcm_v_fp16[args->buf_idx], total_rows, (int) factx->DV,
|
||||
__fp16 * v_tiles_dest = factx->use_pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];
|
||||
|
||||
hmx_interleave_cols_to_tiles(v_tiles_dest, factx->vtcm_v_fp16[args->buf_idx], total_rows, (int) factx->DV,
|
||||
(int) args->src_stride, (int) args->n_col_tiles, start, end);
|
||||
}
|
||||
|
||||
@@ -464,10 +470,10 @@ static void fa_q_load_thread(unsigned int n, unsigned int i, void * data) {
|
||||
for (size_t r = start; r < end; r += 2) {
|
||||
const bool next_row_valid = (r + 1) < n_rows_g;
|
||||
|
||||
const size_t q_idx0 = (r + 0) / G;
|
||||
const size_t h_idx0 = (r + 0) % G;
|
||||
const size_t q_idx1 = (r + 1) / G;
|
||||
const size_t h_idx1 = (r + 1) % G;
|
||||
const size_t q_idx0 = fastdiv(r + 0, &factx->div_G);
|
||||
const size_t h_idx0 = fastmodulo(r + 0, G, &factx->div_G);
|
||||
const size_t q_idx1 = fastdiv(r + 1, &factx->div_G);
|
||||
const size_t h_idx1 = fastmodulo(r + 1, G, &factx->div_G);
|
||||
|
||||
const uint8_t * q_ptr0 = (const uint8_t *) q->data + (q_start + q_idx0) * q->nb[1] +
|
||||
(kv_head * G + h_idx0) * q->nb[2] + ib3 * q->nb[3];
|
||||
@@ -567,8 +573,8 @@ static void fa_o_store_thread(unsigned int n, unsigned int i, void * data) {
|
||||
const uint32_t ib3 = args->ib3;
|
||||
|
||||
for (size_t r = start; r < end; ++r) {
|
||||
const size_t q_idx = r / G;
|
||||
const size_t h_idx = r % G;
|
||||
const size_t q_idx = fastdiv(r, &factx->div_G);
|
||||
const size_t h_idx = fastmodulo(r, G, &factx->div_G);
|
||||
|
||||
// FIX(dst-indexing): ggml_flash_attn_ext() creates dst as permute(0,2,1,3) ->
|
||||
// [DV, n_heads, n_tokens, n_seq], so head stride is nb[1] and token stride is nb[2].
|
||||
@@ -780,11 +786,11 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) {
|
||||
if (args->mask_vtcm) {
|
||||
// Read mask from VTCM buffer (DMA'd per KV block).
|
||||
// GQA dedup (scheme B): skip load when qi unchanged.
|
||||
const size_t qi0 = (r + 0) / G;
|
||||
const size_t qi0 = fastdiv(r + 0, &factx->div_G);
|
||||
v_mask0 = *(const HVX_UVector *) (args->mask_vtcm + qi0 * args->mask_vtcm_row_stride + c);
|
||||
v_mask1 = v_neg_inf;
|
||||
if (r + 1 < (int) n_rows_g) {
|
||||
const size_t qi1 = (r + 1) / G;
|
||||
const size_t qi1 = fastdiv(r + 1, &factx->div_G);
|
||||
if (qi1 == qi0) {
|
||||
v_mask1 = v_mask0; // scheme B: reuse — same mask row
|
||||
} else {
|
||||
@@ -794,8 +800,8 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) {
|
||||
} else {
|
||||
// Fallback: read mask directly from DDR (when mask->ne[2] > 1).
|
||||
const struct htp_tensor * mask = args->mask;
|
||||
const size_t q_idx0 = args->q_start + ((r + 0) / G);
|
||||
const size_t h_idx0 = args->kv_head * G + (r + 0) % G;
|
||||
const size_t q_idx0 = args->q_start + fastdiv(r + 0, &factx->div_G);
|
||||
const size_t h_idx0 = args->kv_head * G + fastmodulo(r + 0, G, &factx->div_G);
|
||||
const uint32_t im2_0 = h_idx0 % mask->ne[2];
|
||||
const uint32_t im3_0 = args->ib3 % mask->ne[3];
|
||||
|
||||
@@ -805,12 +811,12 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) {
|
||||
v_mask1 = v_neg_inf;
|
||||
|
||||
if (r + 1 < (int) n_rows_g) {
|
||||
const size_t q_idx1 = args->q_start + ((r + 1) / G);
|
||||
const size_t q_idx1 = args->q_start + fastdiv(r + 1, &factx->div_G);
|
||||
if (q_idx1 == q_idx0) {
|
||||
// scheme B: same mask row in DDR path
|
||||
v_mask1 = v_mask0;
|
||||
} else {
|
||||
const size_t h_idx1 = args->kv_head * G + (r + 1) % G;
|
||||
const size_t h_idx1 = args->kv_head * G + fastmodulo(r + 1, G, &factx->div_G);
|
||||
const uint32_t im2_1 = h_idx1 % mask->ne[2];
|
||||
const uint32_t im3_1 = args->ib3 % mask->ne[3];
|
||||
const __fp16 * m1_ptr = (const __fp16 *) ((const uint8_t *) mask->data + q_idx1 * mask->nb[1] +
|
||||
@@ -1191,14 +1197,13 @@ static void hmx_fa_o_norm_worker(void * data) {
|
||||
// Row r in the GQA-merged block maps to Q head h = kv_head * G + r % G.
|
||||
// slope(h) = m0^(h+1) when h < n_head_log2, else m1^(2*(h-n_head_log2)+1).
|
||||
// When max_bias == 0, all slopes are 1.0 (no ALiBi).
|
||||
static __attribute__((noinline)) void fa_compute_slopes(fa_softmax_args_t * sargs,
|
||||
static __attribute__((noinline)) void fa_compute_slopes(
|
||||
const struct hmx_fa_context * factx,
|
||||
uint32_t kv_head,
|
||||
size_t n_rows_g) {
|
||||
__fp16 * slopes = factx->vtcm_slopes;
|
||||
if (factx->max_bias == 0.0f) {
|
||||
for (size_t r = 0; r < n_rows_g; ++r) {
|
||||
sargs->slopes[r] = 1.0f;
|
||||
}
|
||||
hvx_splat_f16_a(slopes, 1.0f, n_rows_g);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1207,10 +1212,32 @@ static __attribute__((noinline)) void fa_compute_slopes(fa_softmax_args_t * sarg
|
||||
const float m0 = factx->m0;
|
||||
const float m1 = factx->m1;
|
||||
|
||||
for (size_t r = 0; r < n_rows_g; ++r) {
|
||||
const uint32_t h = kv_head * G + r % G;
|
||||
sargs->slopes[r] = (h < n_head_log2) ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1);
|
||||
__fp16 temp_slopes[512] __attribute__((aligned(128)));
|
||||
if (G <= 32) {
|
||||
// Fast path: Compute G unique slope values in vector registers
|
||||
HVX_Vector v_val = hvx_alibi_slopes(kv_head, G, n_head_log2, m0, m1);
|
||||
|
||||
__fp16 temp_slopes_aligned[64] __attribute__((aligned(128)));
|
||||
hvx_vmem(temp_slopes_aligned) = hvx_vec_f32_to_f16(v_val, Q6_V_vzero());
|
||||
|
||||
for (uint32_t i = 0; i < G; ++i) {
|
||||
temp_slopes[i] = temp_slopes_aligned[i];
|
||||
}
|
||||
} else {
|
||||
// Fallback path: G > 32 (rare configurations)
|
||||
for (uint32_t i = 0; i < G; ++i) {
|
||||
temp_slopes[i] = (__fp16)alibi_slope(kv_head * G + i, n_head_log2, m0, m1);
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate stack buffer to avoid scalar writes to VTCM (which generates L2 misses)
|
||||
__fp16 local_slopes[n_rows_g] __attribute__((aligned(128)));
|
||||
for (size_t r = 0; r < n_rows_g; ++r) {
|
||||
local_slopes[r] = temp_slopes[fastmodulo(r, G, &factx->div_G)];
|
||||
}
|
||||
|
||||
// Copy to VTCM slopes using HVX block copy (both are aligned to 128 bytes)
|
||||
hvx_copy_f16_aa((uint8_t *)slopes, (const uint8_t *)local_slopes, n_rows_g);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
@@ -1254,19 +1281,22 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
const uint32_t G = neq2 / n_kv_heads;
|
||||
|
||||
// Thread count for multi-thread HVX phases
|
||||
const uint32_t n_threads = octx->n_threads;
|
||||
const uint32_t n_threads_init = octx->n_threads;
|
||||
|
||||
// Compute dynamic block sizes (GQA-aware, accounting for per-thread row bufs)
|
||||
size_t Br, Bc;
|
||||
const size_t vtcm_budget = ctx->vtcm_size;
|
||||
if (hmx_fa_find_chunk_size(&Br, &Bc, G, DK, DV, neq1, nek1, vtcm_budget, n_threads) != 0) {
|
||||
if (hmx_fa_find_chunk_size(&Br, &Bc, G, DK, DV, neq1, nek1, vtcm_budget, n_threads_init) != 0) {
|
||||
return HTP_STATUS_VTCM_TOO_SMALL;
|
||||
}
|
||||
|
||||
const size_t g_br = hex_align_up(G * Br, HMX_FP16_TILE_N_ROWS);
|
||||
|
||||
const uint32_t n_kv_blocks = (nek1 + Bc - 1) / Bc;
|
||||
const bool use_pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2);
|
||||
const bool use_pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2);
|
||||
|
||||
// Bypass thread pool dispatch for small prompts/non-pipelined prefill by setting n_threads = 1
|
||||
const uint32_t n_threads = use_pipeline ? n_threads_init : 1;
|
||||
|
||||
FARF(HIGH, "hmx-fa: neq1=%u nek1=%u DK=%u DV=%u G=%u Br=%zu Bc=%zu g_br=%zu n_kv_blocks=%u pipeline=%d vtcm=%zu",
|
||||
neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, use_pipeline, vtcm_budget);
|
||||
@@ -1282,6 +1312,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
factx.n_kv_heads = n_kv_heads;
|
||||
factx.n_heads = neq2;
|
||||
factx.G = G;
|
||||
factx.div_G = init_fastdiv_values(G);
|
||||
factx.neq1 = neq1;
|
||||
factx.Br = (uint32_t) Br;
|
||||
factx.Bc = (uint32_t) Bc;
|
||||
@@ -1354,7 +1385,12 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
factx.vtcm_v_fp16[0] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_dma_bytes);
|
||||
factx.vtcm_v_fp16[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_dma_bytes);
|
||||
factx.vtcm_k_tiles = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, k_tile_bytes);
|
||||
factx.vtcm_v_tiles = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
|
||||
factx.vtcm_v_tiles[0] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
|
||||
if (use_pipeline) {
|
||||
factx.vtcm_v_tiles[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
|
||||
} else {
|
||||
factx.vtcm_v_tiles[1] = NULL;
|
||||
}
|
||||
factx.vtcm_s_tiles = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, s_tile_bytes);
|
||||
factx.vtcm_p_tiles = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, s_tile_bytes);
|
||||
factx.vtcm_d_tiles = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, d_tile_bytes);
|
||||
@@ -1457,6 +1493,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
// ---- KV block loop with DMA double-buffering ----
|
||||
size_t buf_idx = 0;
|
||||
|
||||
fa_compute_slopes(&factx, kv_head, n_rows_g);
|
||||
|
||||
// Prefetch first KV block
|
||||
if (factx.n_kv_blocks > 0) {
|
||||
const uint32_t kv_rows0 = hex_smin(Bc, nek1);
|
||||
@@ -1535,7 +1573,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
ou_job.o_curr = o_tile_curr;
|
||||
ou_job.o_prev = o_tile_prev;
|
||||
ou_job.p_tiles = factx.vtcm_p_tiles;
|
||||
ou_job.v_tiles = factx.vtcm_v_tiles;
|
||||
ou_job.v_tiles = factx.vtcm_v_tiles[1 - buf_idx];
|
||||
ou_job.d_tiles = factx.vtcm_d_tiles;
|
||||
ou_job.hmx_scales = factx.vtcm_hmx_scales_id;
|
||||
ou_job.n_row_tiles = n_row_tiles;
|
||||
@@ -1550,11 +1588,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
fa_phase_k_interleave(&factx, kv_rows, k_src_stride, buf_idx);
|
||||
TIMER_STOP(k_interleave);
|
||||
|
||||
if (kv_blk > 0) {
|
||||
hmx_queue_pop(hmx_q);
|
||||
hex_swap_ptr((void **) &o_tile_curr, (void **) &o_tile_prev);
|
||||
}
|
||||
|
||||
// ---- Phase 2: qk_dot(blk) on HMX ‖ V_int(blk) + DMA prefetch on HVX ----
|
||||
qk_job.q_tiles = factx.vtcm_q_tiles;
|
||||
qk_job.k_tiles = factx.vtcm_k_tiles;
|
||||
@@ -1574,6 +1607,13 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
fa_phase_v_interleave(&factx, kv_rows, v_src_stride, buf_idx, n_tiles_per_bc);
|
||||
TIMER_STOP(v_interleave);
|
||||
|
||||
// Pop and swap previous block's output update (deferred HMX pop)
|
||||
if (kv_blk > 0) {
|
||||
hmx_queue_pop(hmx_q);
|
||||
hex_swap_ptr((void **) &o_tile_curr, (void **) &o_tile_prev);
|
||||
}
|
||||
|
||||
// Pop current block's dot product job
|
||||
hmx_queue_pop(hmx_q);
|
||||
TIMER_STOP(qk_dot);
|
||||
|
||||
@@ -1601,7 +1641,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
sargs.mask_vtcm = has_mask_dma ? (const __fp16 *) factx.vtcm_mask_buf : NULL;
|
||||
sargs.mask_vtcm_row_stride = factx.mask_buf_row_stride;
|
||||
sargs.slopes = factx.vtcm_slopes;
|
||||
fa_compute_slopes(&sargs, &factx, kv_head, n_rows_g);
|
||||
|
||||
TIMER_START(softmax);
|
||||
fa_phase_softmax_and_build_d(&factx, &sargs, n_row_tiles, n_row_tiles_g_br);
|
||||
@@ -1617,7 +1656,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
ou_job.o_curr = o_tile_curr;
|
||||
ou_job.o_prev = o_tile_prev;
|
||||
ou_job.p_tiles = factx.vtcm_p_tiles;
|
||||
ou_job.v_tiles = factx.vtcm_v_tiles;
|
||||
ou_job.v_tiles = factx.vtcm_v_tiles[1 - buf_idx];
|
||||
ou_job.d_tiles = factx.vtcm_d_tiles;
|
||||
ou_job.hmx_scales = factx.vtcm_hmx_scales_id;
|
||||
ou_job.n_row_tiles = n_row_tiles;
|
||||
@@ -1712,7 +1751,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
sargs.mask_vtcm = has_mask_dma ? (const __fp16 *) factx.vtcm_mask_buf : NULL;
|
||||
sargs.mask_vtcm_row_stride = factx.mask_buf_row_stride;
|
||||
sargs.slopes = factx.vtcm_slopes;
|
||||
fa_compute_slopes(&sargs, &factx, kv_head, n_rows_g);
|
||||
|
||||
TIMER_START(softmax);
|
||||
fa_phase_softmax_and_build_d(&factx, &sargs, n_row_tiles, n_row_tiles_g_br);
|
||||
@@ -1732,7 +1770,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
const size_t DV_tiles = (size_t) (DV / 32);
|
||||
const __fp16 * restrict d_base = factx.vtcm_d_tiles;
|
||||
const __fp16 * restrict p_base = factx.vtcm_p_tiles;
|
||||
const __fp16 * restrict v_base = factx.vtcm_v_tiles;
|
||||
const __fp16 * restrict v_base = factx.vtcm_v_tiles[0];
|
||||
const __fp16 * restrict op_base = o_tile_prev;
|
||||
__fp16 * restrict oc_base = o_tile_curr;
|
||||
__builtin_assume(n_row_tiles > 0);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,6 @@
|
||||
// HMX operations compiled as a single translation unit.
|
||||
// This allows interprocedural optimizations within HMX ops without requiring global HTP LTO.
|
||||
|
||||
#include "hmx-queue.c"
|
||||
#include "hmx-matmul-ops.c"
|
||||
#include "hmx-flash-attn-ops.c"
|
||||
@@ -52,14 +52,32 @@ int hmx_matmul_f16_f32(struct htp_context *ctx,
|
||||
// Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3.
|
||||
int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params);
|
||||
|
||||
// HMX matrix multiplication — quantised weights (Q4_0/Q8_0/IQ4_NL/MXFP4)
|
||||
int hmx_matmul_q_f32(struct htp_context *ctx,
|
||||
// HMX matrix multiplication — all supported weight types (F16/F32/Q4_0/Q4_1/Q8_0/IQ4_NL/MXFP4)
|
||||
int hmx_matmul_2d_f32(struct htp_context *ctx,
|
||||
float *restrict dst,
|
||||
const float *activation,
|
||||
const uint8_t *permuted_weight,
|
||||
int m, int k, int n,
|
||||
int act_stride,
|
||||
int weight_stride,
|
||||
int weight_type);
|
||||
|
||||
struct mmid_row_mapping;
|
||||
|
||||
int hmx_matmul_id_2d_f32(struct htp_context *ctx,
|
||||
float *restrict dst,
|
||||
const float *activation,
|
||||
const uint8_t *permuted_weight,
|
||||
int m, int k, int n,
|
||||
int ne11,
|
||||
size_t act_nb1, size_t act_nb2,
|
||||
size_t dst_nb1, size_t dst_nb2,
|
||||
int weight_stride,
|
||||
int weight_type,
|
||||
const struct mmid_row_mapping *matrix_rows,
|
||||
int cur_a,
|
||||
int mapping_stride);
|
||||
|
||||
// HMX flash attention
|
||||
int hmx_flash_attn_ext(struct htp_ops_context * octx);
|
||||
|
||||
|
||||
@@ -79,6 +79,10 @@ struct htp_context {
|
||||
|
||||
uint64_t max_vmem;
|
||||
|
||||
// Persistent DDR scratchpad for MUL_MAT_ID mappings
|
||||
void * ddr_spad_base;
|
||||
size_t ddr_spad_size;
|
||||
|
||||
struct htp_ops_context octx;
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
#ifndef HVX_FLASH_ATTN_H
|
||||
#define HVX_FLASH_ATTN_H
|
||||
|
||||
#include <math.h>
|
||||
#include "hvx-utils.h"
|
||||
|
||||
// Scalar helper to compute a single ALiBi slope.
|
||||
static inline float alibi_slope(uint32_t h, uint32_t n_head_log2, float m0, float m1) {
|
||||
return (h < n_head_log2) ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1);
|
||||
}
|
||||
|
||||
// Vectorized helper to compute 32 ALiBi slopes starting from (kv_head * G).
|
||||
static inline HVX_Vector hvx_alibi_slopes(
|
||||
uint32_t kv_head,
|
||||
uint32_t G,
|
||||
uint32_t n_head_log2,
|
||||
float m0,
|
||||
float m1
|
||||
) {
|
||||
static const float ramp_32[32] __attribute__((aligned(128))) = {
|
||||
0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
|
||||
8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f,
|
||||
16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f,
|
||||
24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f
|
||||
};
|
||||
HVX_Vector v_ramp = hvx_vmem(ramp_32);
|
||||
HVX_Vector v_h_base = hvx_vec_splat_f32((float)(kv_head * G));
|
||||
HVX_Vector v_h = hvx_vec_add_f32_f32(v_h_base, v_ramp);
|
||||
|
||||
// Compute exponent_m0: h + 1
|
||||
HVX_Vector v_exp_m0 = hvx_vec_add_f32_f32(v_h, hvx_vec_splat_f32(1.0f));
|
||||
|
||||
// Compute exponent_m1: 2 * (h - n_head_log2) + 1
|
||||
HVX_Vector v_n_head_log2 = hvx_vec_splat_f32((float)n_head_log2);
|
||||
HVX_Vector v_h_minus = hvx_vec_sub_f32_f32(v_h, v_n_head_log2);
|
||||
HVX_Vector v_exp_m1 = hvx_vec_add_f32_f32(hvx_vec_mul_f32_f32(hvx_vec_splat_f32(2.0f), v_h_minus), hvx_vec_splat_f32(1.0f));
|
||||
|
||||
// Compute powers
|
||||
HVX_Vector v_pow_m0 = hvx_vec_pow_const_base_f32(m0, v_exp_m0);
|
||||
HVX_Vector v_pow_m1 = hvx_vec_pow_const_base_f32(m1, v_exp_m1);
|
||||
|
||||
// Select based on h < n_head_log2
|
||||
HVX_VectorPred p_cond = Q6_Q_vcmp_gt_VsfVsf(v_n_head_log2, v_h); // v_n_head_log2 > v_h <=> h < n_head_log2
|
||||
return Q6_V_vmux_QVV(p_cond, v_pow_m0, v_pow_m1);
|
||||
}
|
||||
|
||||
#endif /* HVX_FLASH_ATTN_H */
|
||||
@@ -0,0 +1,65 @@
|
||||
#ifndef HVX_LOG_H
|
||||
#define HVX_LOG_H
|
||||
|
||||
#include "hvx-base.h"
|
||||
|
||||
// Approximates ln(x) element-wise for float vectors.
|
||||
// x must contain positive float elements.
|
||||
// Uses Abramowitz & Stegun polynomial approximation 4.1.44 for ln(1+y) over [0, 1].
|
||||
static inline HVX_Vector hvx_vec_log_f32(HVX_Vector x) {
|
||||
// x = m * 2^e, where m in [1, 2)
|
||||
HVX_Vector biased_e = Q6_Vuw_vlsr_VuwR(x, 23);
|
||||
HVX_Vector e_int = Q6_Vw_vsub_VwVw(biased_e, Q6_V_vsplat_R(127));
|
||||
HVX_Vector e_float = Q6_Vsf_equals_Vw(e_int);
|
||||
|
||||
// Extract mantissa and set exponent to 127 (which represents float value in [1.0, 2.0))
|
||||
HVX_Vector mant_mask = Q6_V_vsplat_R(0x007FFFFF);
|
||||
HVX_Vector exp_127 = Q6_V_vsplat_R(0x3F800000);
|
||||
HVX_Vector m = Q6_V_vor_VV(Q6_V_vand_VV(x, mant_mask), exp_127);
|
||||
|
||||
// y = m - 1.0f, y in [0, 1)
|
||||
HVX_Vector y = hvx_vec_sub_f32_f32(m, hvx_vec_splat_f32(1.0f));
|
||||
|
||||
// Abramowitz & Stegun 4.1.44 polynomial approximation of ln(1+y)
|
||||
HVX_Vector c;
|
||||
HVX_Vector res;
|
||||
|
||||
c = hvx_vec_splat_f32(-0.0064535442f);
|
||||
res = hvx_vec_mul_f32_f32(y, c);
|
||||
|
||||
c = hvx_vec_splat_f32(0.0360884937f);
|
||||
res = hvx_vec_add_f32_f32(res, c);
|
||||
res = hvx_vec_mul_f32_f32(y, res);
|
||||
|
||||
c = hvx_vec_splat_f32(-0.0953293897f);
|
||||
res = hvx_vec_add_f32_f32(res, c);
|
||||
res = hvx_vec_mul_f32_f32(y, res);
|
||||
|
||||
c = hvx_vec_splat_f32(0.1676540711f);
|
||||
res = hvx_vec_add_f32_f32(res, c);
|
||||
res = hvx_vec_mul_f32_f32(y, res);
|
||||
|
||||
c = hvx_vec_splat_f32(-0.2407338084f);
|
||||
res = hvx_vec_add_f32_f32(res, c);
|
||||
res = hvx_vec_mul_f32_f32(y, res);
|
||||
|
||||
c = hvx_vec_splat_f32(0.3317990258f);
|
||||
res = hvx_vec_add_f32_f32(res, c);
|
||||
res = hvx_vec_mul_f32_f32(y, res);
|
||||
|
||||
c = hvx_vec_splat_f32(-0.4998741238f);
|
||||
res = hvx_vec_add_f32_f32(res, c);
|
||||
res = hvx_vec_mul_f32_f32(y, res);
|
||||
|
||||
c = hvx_vec_splat_f32(0.9999964239f);
|
||||
res = hvx_vec_add_f32_f32(res, c);
|
||||
res = hvx_vec_mul_f32_f32(y, res);
|
||||
|
||||
// ln(x) = e * ln(2) + ln(1+y)
|
||||
HVX_Vector ln2 = hvx_vec_splat_f32(0.69314718056f);
|
||||
HVX_Vector term_e = hvx_vec_mul_f32_f32(e_float, ln2);
|
||||
|
||||
return hvx_vec_add_f32_f32(term_e, res);
|
||||
}
|
||||
|
||||
#endif /* HVX_LOG_H */
|
||||
@@ -0,0 +1,42 @@
|
||||
#ifndef HVX_POW_H
|
||||
#define HVX_POW_H
|
||||
|
||||
#include <math.h>
|
||||
#include "hvx-base.h"
|
||||
#include "hvx-exp.h"
|
||||
#include "hvx-log.h"
|
||||
|
||||
// Approximates base^exponent element-wise for float vectors.
|
||||
// base must be a positive constant. exponent is an HVX f32 vector.
|
||||
// Uses base^x = exp(x * ln(base)).
|
||||
static inline HVX_Vector hvx_vec_pow_const_base_f32(float base, HVX_Vector exponent) {
|
||||
float ln_base = logf(base);
|
||||
HVX_Vector ln_base_v = hvx_vec_splat_f32(ln_base);
|
||||
HVX_Vector x = hvx_vec_mul_f32_f32(exponent, ln_base_v);
|
||||
|
||||
static const float kInf = INFINITY;
|
||||
static const float kMaxExp = 88.7228f;
|
||||
|
||||
const HVX_Vector max_exp = hvx_vec_splat_f32(kMaxExp);
|
||||
const HVX_Vector inf = hvx_vec_splat_f32(kInf);
|
||||
|
||||
return hvx_vec_exp_f32_guard(x, max_exp, inf);
|
||||
}
|
||||
|
||||
// Approximates base^exponent element-wise for float vectors.
|
||||
// base and exponent are HVX f32 vectors. base elements must be positive.
|
||||
// Uses base^exponent = exp(exponent * ln(base)).
|
||||
static inline HVX_Vector hvx_vec_pow_f32(HVX_Vector base, HVX_Vector exponent) {
|
||||
HVX_Vector ln_base = hvx_vec_log_f32(base);
|
||||
HVX_Vector x = hvx_vec_mul_f32_f32(exponent, ln_base);
|
||||
|
||||
static const float kInf = INFINITY;
|
||||
static const float kMaxExp = 88.7228f;
|
||||
|
||||
const HVX_Vector max_exp = hvx_vec_splat_f32(kMaxExp);
|
||||
const HVX_Vector inf = hvx_vec_splat_f32(kInf);
|
||||
|
||||
return hvx_vec_exp_f32_guard(x, max_exp, inf);
|
||||
}
|
||||
|
||||
#endif /* HVX_POW_H */
|
||||
@@ -17,5 +17,7 @@
|
||||
#include "hvx-floor.h"
|
||||
#include "hvx-sin-cos.h"
|
||||
#include "hvx-base.h"
|
||||
#include "hvx-pow.h"
|
||||
#include "hvx-log.h"
|
||||
|
||||
#endif /* HVX_UTILS_H */
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <HAP_mem.h>
|
||||
#include <HAP_power.h>
|
||||
#include <HAP_ps.h>
|
||||
#include <HAP_dcvs.h>
|
||||
#include <qurt.h>
|
||||
#include <qurt_thread.h>
|
||||
#include <qurt_memory.h>
|
||||
@@ -63,8 +64,7 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
||||
|
||||
request.type = HAP_power_set_DCVS_v3;
|
||||
request.dcvs_v3.set_dcvs_enable = TRUE;
|
||||
request.dcvs_v3.dcvs_enable = TRUE;
|
||||
request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
|
||||
request.dcvs_v3.dcvs_enable = FALSE;
|
||||
request.dcvs_v3.set_bus_params = TRUE;
|
||||
request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX;
|
||||
request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
|
||||
@@ -75,6 +75,10 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
||||
request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
|
||||
request.dcvs_v3.set_sleep_disable = TRUE;
|
||||
request.dcvs_v3.sleep_disable = TRUE;
|
||||
|
||||
#if (__HEXAGON_ARCH__ >= 79)
|
||||
HAP_set_dcvs_v3_protected_bus_corners(&request, 1);
|
||||
#endif
|
||||
if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
|
||||
return err;
|
||||
}
|
||||
@@ -103,7 +107,7 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
||||
FARF(ALWAYS, "Setting HMX clock\n");
|
||||
err = HAP_power_set((void *) ctx, &request);
|
||||
if (err != AEE_SUCCESS) {
|
||||
FARF(ERROR, "Error setting HMX clock.");
|
||||
FARF(ERROR, "ggml-hex: error setting HMX clock.");
|
||||
return err;
|
||||
}
|
||||
}
|
||||
@@ -117,7 +121,7 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
||||
FARF(ALWAYS, "Powering HMX on\n");
|
||||
err = HAP_power_set((void *) ctx, &request);
|
||||
if (err != AEE_SUCCESS) {
|
||||
FARF(ERROR, "Error powering on HMX.");
|
||||
FARF(ERROR, "ggml-hex: error powering on HMX.");
|
||||
return err;
|
||||
}
|
||||
}
|
||||
@@ -423,10 +427,18 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
||||
ctx->dma[i] = dma_queue_create(256); // queue depth
|
||||
}
|
||||
|
||||
ctx->ddr_spad_size = 512 * 1024; // 512 KB
|
||||
ctx->ddr_spad_base = memalign(128, ctx->ddr_spad_size);
|
||||
|
||||
// init worker pool
|
||||
err = worker_pool_init(&ctx->worker_pool, n_hvx);
|
||||
if (err != AEE_SUCCESS) {
|
||||
FARF(ERROR, "Unable to create worker pool");
|
||||
if (ctx->ddr_spad_base) {
|
||||
free(ctx->ddr_spad_base);
|
||||
ctx->ddr_spad_base = NULL;
|
||||
ctx->ddr_spad_size = 0;
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
@@ -474,6 +486,12 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
|
||||
|
||||
vtcm_free(ctx);
|
||||
|
||||
if (ctx->ddr_spad_base) {
|
||||
free(ctx->ddr_spad_base);
|
||||
ctx->ddr_spad_base = NULL;
|
||||
ctx->ddr_spad_size = 0;
|
||||
}
|
||||
|
||||
return AEE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -53,6 +53,11 @@ struct htp_matmul_context {
|
||||
struct fastdiv_values mm_div_ne1;
|
||||
struct fastdiv_values mm_div_r2;
|
||||
struct fastdiv_values mm_div_r3;
|
||||
|
||||
// Fields for scattered mapping & HMX support in MUL_MAT_ID
|
||||
const uint32_t * matrix_row_counts;
|
||||
const struct mmid_row_mapping * matrix_rows;
|
||||
bool hmx_eligible;
|
||||
};
|
||||
|
||||
// vdelta control to expand first 32 e8m0 values into 32 uint32 elements
|
||||
@@ -2913,6 +2918,176 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float
|
||||
hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); // row0,col1 row1,col1
|
||||
}
|
||||
|
||||
#if __HVX_ARCH__ < 79
|
||||
#define HVX_OP_ADD_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b))
|
||||
#define HVX_OP_MUL_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b))
|
||||
#else
|
||||
#define HVX_OP_ADD_F32(a, b) Q6_Vsf_vadd_VsfVsf(a, b)
|
||||
#define HVX_OP_MUL_F32(a, b) Q6_Vsf_vmpy_VsfVsf(a, b)
|
||||
#endif
|
||||
|
||||
static void vec_dot_f32_f32_aa_1x1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
const HVX_Vector * restrict x = (const HVX_Vector *) vx;
|
||||
const HVX_Vector * restrict y = (const HVX_Vector *) vy;
|
||||
|
||||
uint32_t nvec = n / VLEN_FP32; // num full fp32 hvx vectors
|
||||
uint32_t nloe = n % VLEN_FP32; // leftover elements
|
||||
|
||||
HVX_Vector rsum = Q6_V_vzero();
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (i = 0; i < nvec; i++) {
|
||||
HVX_Vector prod = HVX_OP_MUL_F32(x[i], y[i]);
|
||||
rsum = HVX_OP_ADD_F32(rsum, prod);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
|
||||
HVX_Vector x_sf = Q6_V_vand_QV(bmask, x[i]);
|
||||
HVX_Vector y_sf = Q6_V_vand_QV(bmask, y[i]);
|
||||
HVX_Vector prod = HVX_OP_MUL_F32(x_sf, y_sf);
|
||||
rsum = HVX_OP_ADD_F32(rsum, prod);
|
||||
}
|
||||
|
||||
*s = hvx_vec_get_f32(hvx_vec_reduce_sum_f32(rsum));
|
||||
}
|
||||
|
||||
static void vec_dot_f32_f32_aa_2x1(const int n, float * restrict s0,
|
||||
const void * restrict vx0, const void * restrict vx1,
|
||||
const void * restrict vy0) {
|
||||
const HVX_Vector * restrict x0 = (const HVX_Vector *) vx0;
|
||||
const HVX_Vector * restrict x1 = (const HVX_Vector *) vx1;
|
||||
const HVX_Vector * restrict y = (const HVX_Vector *) vy0;
|
||||
|
||||
uint32_t nvec = n / VLEN_FP32;
|
||||
uint32_t nloe = n % VLEN_FP32;
|
||||
|
||||
HVX_Vector rsum0 = Q6_V_vzero();
|
||||
HVX_Vector rsum1 = Q6_V_vzero();
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(2)
|
||||
for (i = 0; i < nvec; i++) {
|
||||
HVX_Vector y_sf = y[i];
|
||||
HVX_Vector prod0 = HVX_OP_MUL_F32(x0[i], y_sf);
|
||||
HVX_Vector prod1 = HVX_OP_MUL_F32(x1[i], y_sf);
|
||||
rsum0 = HVX_OP_ADD_F32(rsum0, prod0);
|
||||
rsum1 = HVX_OP_ADD_F32(rsum1, prod1);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
|
||||
HVX_Vector y_sf = Q6_V_vand_QV(bmask, y[i]);
|
||||
HVX_Vector x0_sf = Q6_V_vand_QV(bmask, x0[i]);
|
||||
HVX_Vector x1_sf = Q6_V_vand_QV(bmask, x1[i]);
|
||||
HVX_Vector prod0 = HVX_OP_MUL_F32(x0_sf, y_sf);
|
||||
HVX_Vector prod1 = HVX_OP_MUL_F32(x1_sf, y_sf);
|
||||
rsum0 = HVX_OP_ADD_F32(rsum0, prod0);
|
||||
rsum1 = HVX_OP_ADD_F32(rsum1, prod1);
|
||||
}
|
||||
|
||||
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(rsum0, rsum1);
|
||||
HVX_VectorAlias va;
|
||||
va.v = rsum;
|
||||
s0[0] = va.fp32[0];
|
||||
s0[1] = va.fp32[1];
|
||||
}
|
||||
|
||||
static void vec_dot_f32_f32_aa_2x2(const int n, float * restrict s0, float * restrict s1,
|
||||
const void * restrict vx0, const void * restrict vx1,
|
||||
const void * restrict vy0, const void * restrict vy1) {
|
||||
const HVX_Vector * restrict x0 = (const HVX_Vector *) vx0;
|
||||
const HVX_Vector * restrict x1 = (const HVX_Vector *) vx1;
|
||||
const HVX_Vector * restrict y0 = (const HVX_Vector *) vy0;
|
||||
const HVX_Vector * restrict y1 = (const HVX_Vector *) vy1;
|
||||
|
||||
uint32_t nvec = n / VLEN_FP32;
|
||||
uint32_t nloe = n % VLEN_FP32;
|
||||
|
||||
HVX_Vector r0_c0_sum = Q6_V_vzero();
|
||||
HVX_Vector r0_c1_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_c0_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_c1_sum = Q6_V_vzero();
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(2)
|
||||
for (i = 0; i < nvec; i++) {
|
||||
HVX_Vector r0_sf = x0[i];
|
||||
HVX_Vector r1_sf = x1[i];
|
||||
HVX_Vector c0_sf = y0[i];
|
||||
HVX_Vector c1_sf = y1[i];
|
||||
|
||||
r0_c0_sum = HVX_OP_ADD_F32(r0_c0_sum, HVX_OP_MUL_F32(r0_sf, c0_sf));
|
||||
r0_c1_sum = HVX_OP_ADD_F32(r0_c1_sum, HVX_OP_MUL_F32(r0_sf, c1_sf));
|
||||
r1_c0_sum = HVX_OP_ADD_F32(r1_c0_sum, HVX_OP_MUL_F32(r1_sf, c0_sf));
|
||||
r1_c1_sum = HVX_OP_ADD_F32(r1_c1_sum, HVX_OP_MUL_F32(r1_sf, c1_sf));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
|
||||
|
||||
HVX_Vector r0_sf = Q6_V_vand_QV(bmask, x0[i]);
|
||||
HVX_Vector r1_sf = Q6_V_vand_QV(bmask, x1[i]);
|
||||
HVX_Vector c0_sf = Q6_V_vand_QV(bmask, y0[i]);
|
||||
HVX_Vector c1_sf = Q6_V_vand_QV(bmask, y1[i]);
|
||||
|
||||
r0_c0_sum = HVX_OP_ADD_F32(r0_c0_sum, HVX_OP_MUL_F32(r0_sf, c0_sf));
|
||||
r0_c1_sum = HVX_OP_ADD_F32(r0_c1_sum, HVX_OP_MUL_F32(r0_sf, c1_sf));
|
||||
r1_c0_sum = HVX_OP_ADD_F32(r1_c0_sum, HVX_OP_MUL_F32(r1_sf, c0_sf));
|
||||
r1_c1_sum = HVX_OP_ADD_F32(r1_c1_sum, HVX_OP_MUL_F32(r1_sf, c1_sf));
|
||||
}
|
||||
|
||||
// Reduce and store results
|
||||
HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
|
||||
HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
|
||||
|
||||
HVX_VectorAlias va0, va1;
|
||||
va0.v = r0_r1_c0_sum;
|
||||
va1.v = r0_r1_c1_sum;
|
||||
s0[0] = va0.fp32[0];
|
||||
s0[1] = va0.fp32[1];
|
||||
s1[0] = va1.fp32[0];
|
||||
s1[1] = va1.fp32[1];
|
||||
}
|
||||
|
||||
static void vec_dot_f32_f32_uu_1x1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
|
||||
const HVX_UVector * restrict vx = (const HVX_UVector * restrict) x;
|
||||
const HVX_UVector * restrict vy = (const HVX_UVector * restrict) y;
|
||||
|
||||
uint32_t nvec = n / VLEN_FP32; // num full fp32 hvx vectors
|
||||
uint32_t nloe = n % VLEN_FP32; // leftover elements
|
||||
|
||||
HVX_Vector rsum = Q6_V_vzero();
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(2)
|
||||
for (i = 0; i < nvec; i++) {
|
||||
HVX_Vector x_sf = vx[i];
|
||||
HVX_Vector y_sf = vy[i];
|
||||
|
||||
rsum = HVX_OP_ADD_F32(rsum, HVX_OP_MUL_F32(x_sf, y_sf));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector x_sf = vx[i];
|
||||
HVX_Vector y_sf = vy[i];
|
||||
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
|
||||
x_sf = Q6_V_vand_QV(bmask, x_sf);
|
||||
y_sf = Q6_V_vand_QV(bmask, y_sf);
|
||||
|
||||
rsum = HVX_OP_ADD_F32(rsum, HVX_OP_MUL_F32(x_sf, y_sf));
|
||||
}
|
||||
|
||||
rsum = hvx_vec_reduce_sum_f32(rsum);
|
||||
hvx_vec_store_u(&s[0], 4, rsum);
|
||||
}
|
||||
|
||||
static void vec_dot_f16_f16_aa_1x1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
const HVX_Vector * restrict x = (const HVX_Vector *) vx;
|
||||
const HVX_Vector * restrict y = (const HVX_Vector *) vy;
|
||||
@@ -3331,7 +3506,7 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
|
||||
// Process the last row (if any)
|
||||
if (src0_end_row != src0_end_row_x2) {
|
||||
uint32_t ir0 = src0_end_row_x2;
|
||||
const int is0 = (ir0 - src0_start_row);
|
||||
const int is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
|
||||
src0_stride, src0_row_size, 1);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
|
||||
@@ -3466,7 +3641,7 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
|
||||
// Process the last row (if any)
|
||||
if (src0_end_row != src0_end_row_x2) {
|
||||
const uint32_t ir0 = src0_end_row_x2;
|
||||
const uint32_t is0 = (ir0 - src0_start_row);
|
||||
const uint32_t is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
|
||||
src0_stride, src0_row_size, 1);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
|
||||
@@ -3516,11 +3691,8 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
|
||||
const uint32_t n_ids = ids->ne[0]; // n_expert_used
|
||||
const uint32_t n_as = ne02; // n_expert
|
||||
|
||||
const size_t matrix_row_counts_size = n_as * sizeof(uint32_t);
|
||||
const size_t matrix_row_map_size = n_as * ids->ne[0] * ids->ne[1] * sizeof(struct mmid_row_mapping);
|
||||
|
||||
const uint32_t * matrix_row_counts = (const uint32_t *) src2_spad->data + 0;
|
||||
const struct mmid_row_mapping * matrix_rows = (const void *) src2_spad->data + matrix_row_counts_size;
|
||||
const uint32_t * matrix_row_counts = mmctx->matrix_row_counts;
|
||||
const struct mmid_row_mapping * matrix_rows = mmctx->matrix_rows;
|
||||
|
||||
const size_t dst_row_size = nb1;
|
||||
const size_t src0_row_size = nb01;
|
||||
@@ -3542,6 +3714,10 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (mmctx->hmx_eligible) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const uint8_t * src0_row = (const uint8_t *) src0->data + (0 + cur_a * nb02 + 0);
|
||||
|
||||
// Prefill spad with src0 rows
|
||||
@@ -3583,7 +3759,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
|
||||
// Process the last row (if any)
|
||||
if (src0_end_row != src0_end_row_x2) {
|
||||
uint32_t ir0 = src0_end_row_x2;
|
||||
const uint32_t is0 = (ir0 - src0_start_row);
|
||||
const uint32_t is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 1);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
|
||||
@@ -3685,7 +3861,7 @@ static void matvec_id(unsigned int nth, unsigned int ith, void * data) {
|
||||
// Process the last row (if any)
|
||||
if (src0_end_row != src0_end_row_x2) {
|
||||
uint32_t ir0 = src0_end_row_x2;
|
||||
const uint32_t is0 = (ir0 - src0_start_row);
|
||||
const uint32_t is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
|
||||
src0_row_size_padded, src0_row_size, 1);
|
||||
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
|
||||
@@ -4086,6 +4262,47 @@ static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * dat
|
||||
ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
static void quantize_f32_f32(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_matmul_context * mmctx = data;
|
||||
struct htp_ops_context * octx = mmctx->octx;
|
||||
|
||||
const struct htp_tensor * src = octx->src[1];
|
||||
uint8_t * restrict dst = octx->src1_spad.data;
|
||||
uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread;
|
||||
uint32_t dst_stride = octx->src1_spad.stride;
|
||||
|
||||
uint64_t t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
const uint32_t ne0 = src->ne[0];
|
||||
const uint32_t ne1 = src->ne[1];
|
||||
const uint32_t ne2 = src->ne[2];
|
||||
const uint32_t ne3 = src->ne[3];
|
||||
|
||||
const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows
|
||||
|
||||
const uint32_t ir_first = nrows_per_thread * ith; // first row
|
||||
const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row
|
||||
|
||||
const size_t src_row_size = ne0 * sizeof(float);
|
||||
const size_t src_stride = src->nb[1];
|
||||
|
||||
uint8_t * restrict src_data = (uint8_t *) src->data + (src_stride * ir_first);
|
||||
uint8_t * restrict dst_data = (uint8_t *) dst + (dst_stride * ir_first);
|
||||
|
||||
for (uint32_t i = ir_first; i < ir_last; ++i) {
|
||||
hex_l2fetch(src_data, src_row_size, src_stride, 2);
|
||||
hvx_copy_f32_au(dst_data, src_data, ne0);
|
||||
|
||||
dst_data += dst_stride;
|
||||
src_data += src_stride;
|
||||
}
|
||||
|
||||
uint64_t t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "quantize-f32-f32: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first,
|
||||
ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_matmul_context * mmctx = data;
|
||||
struct htp_ops_context * octx = mmctx->octx;
|
||||
@@ -4328,6 +4545,60 @@ static int op_matmul_hvx(struct htp_ops_context * octx) {
|
||||
mmctx->mm_div_r2 = init_fastdiv_values(src1->ne[2] / src0->ne[2]);
|
||||
mmctx->mm_div_r3 = init_fastdiv_values(src1->ne[3] / src0->ne[3]);
|
||||
|
||||
need_quant = false;
|
||||
}
|
||||
} else if (src0->type == HTP_TYPE_F32) {
|
||||
// Try optimized f32-f32 path first (src1 in VTCM)
|
||||
const size_t f32_src1_row_size = hex_round_up(ne10 * 4, 128);
|
||||
const size_t f32_src1_spad_size = hex_round_up(f32_src1_row_size * src1_nrows, 256);
|
||||
const size_t f32_src0_spad_size = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256) * octx->n_threads;
|
||||
const size_t f32_dst_spad_size = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256) * octx->n_threads;
|
||||
|
||||
const size_t f32_total_size = f32_src1_spad_size + f32_src0_spad_size + f32_dst_spad_size;
|
||||
|
||||
const bool is_batched = (ne02 > 1) || (ne03 > 1);
|
||||
const bool is_permuted = htp_is_permuted(octx->src[0]) || htp_is_permuted(octx->src[1]);
|
||||
|
||||
if (!is_batched && !is_permuted && f32_total_size <= octx->ctx->vtcm_size) {
|
||||
// Optimized path
|
||||
quant_job_func = quantize_f32_f32;
|
||||
mmctx->type = "f32-f32";
|
||||
mmctx->vec_dot_1x1 = vec_dot_f32_f32_aa_1x1;
|
||||
mmctx->vec_dot_2x1 = vec_dot_f32_f32_aa_2x1;
|
||||
mmctx->vec_dot_2x2 = vec_dot_f32_f32_aa_2x2;
|
||||
|
||||
src1_row_size = f32_src1_row_size;
|
||||
|
||||
octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
|
||||
octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256);
|
||||
octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256);
|
||||
|
||||
octx->src1_spad.size = octx->src1_spad.size_per_thread;
|
||||
octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
|
||||
octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads;
|
||||
} else {
|
||||
// Fallback to DDR / broadcasting
|
||||
quant_job_func = NULL;
|
||||
mmctx->type = "f32-f32";
|
||||
mmctx->vec_dot_1x1 = vec_dot_f32_f32_uu_1x1;
|
||||
matmul_job_func = matmul_4d;
|
||||
|
||||
src1_row_size = nb11;
|
||||
|
||||
octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
|
||||
octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size, 256);
|
||||
octx->src1_spad.size_per_thread = hex_round_up(MM_SPAD_SRC1_NROWS * src1_row_size, 256);
|
||||
|
||||
octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
|
||||
octx->src1_spad.size = octx->src1_spad.size_per_thread * octx->n_threads;
|
||||
octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads;
|
||||
|
||||
// Init fastdiv for matmul_4d (supports broadcasting)
|
||||
mmctx->mm_div_ne12_ne1 = init_fastdiv_values(src1->ne[2] * dst->ne[1]);
|
||||
mmctx->mm_div_ne1 = init_fastdiv_values(dst->ne[1]);
|
||||
mmctx->mm_div_r2 = init_fastdiv_values(src1->ne[2] / src0->ne[2]);
|
||||
mmctx->mm_div_r3 = init_fastdiv_values(src1->ne[3] / src0->ne[3]);
|
||||
|
||||
need_quant = false;
|
||||
}
|
||||
} else {
|
||||
@@ -4405,20 +4676,20 @@ int op_matmul(struct htp_ops_context * octx) {
|
||||
return op_matmul_hvx(octx);
|
||||
}
|
||||
|
||||
// HMX supports F16, Q4_0, Q8_0, IQ4_NL, MXFP4 weights.
|
||||
// HMX supports F16, F32, Q4_0, Q8_0, IQ4_NL, MXFP4 weights.
|
||||
// Other types fall back to HVX.
|
||||
uint32_t wtype = src0->type;
|
||||
if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q4_1 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL && wtype != HTP_TYPE_MXFP4) {
|
||||
if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q4_1 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL && wtype != HTP_TYPE_MXFP4) {
|
||||
return op_matmul_hvx(octx);
|
||||
}
|
||||
|
||||
// Quantised HMX path requires K aligned to 256 (x4x2 super-block).
|
||||
// F16 HMX path requires K aligned to 32 (tile width).
|
||||
if (wtype != HTP_TYPE_F16 && src0->ne[0] % 256 != 0) {
|
||||
// F16 and F32 HMX paths require K aligned to 32 (tile width).
|
||||
if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32 && src0->ne[0] % 256 != 0) {
|
||||
return op_matmul_hvx(octx);
|
||||
}
|
||||
|
||||
if (wtype == HTP_TYPE_F16 && src0->ne[0] % 32 != 0) {
|
||||
if ((wtype == HTP_TYPE_F16 || wtype == HTP_TYPE_F32) && src0->ne[0] % 32 != 0) {
|
||||
return op_matmul_hvx(octx);
|
||||
}
|
||||
|
||||
@@ -4463,8 +4734,8 @@ int op_matmul(struct htp_ops_context * octx) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
if (src0->type == HTP_TYPE_F16) {
|
||||
if (is_batched) {
|
||||
if (is_batched) {
|
||||
if (src0->type == HTP_TYPE_F16) {
|
||||
hmx_matmul_f16_f32_batched_params_t batch_params = {
|
||||
.dst = (float *) dst->data,
|
||||
.activation = (float *) src1->data,
|
||||
@@ -4488,13 +4759,11 @@ int op_matmul(struct htp_ops_context * octx) {
|
||||
};
|
||||
ret = hmx_matmul_f16_f32_batched(octx->ctx, &batch_params);
|
||||
} else {
|
||||
ret = hmx_matmul_f16_f32(octx->ctx,
|
||||
(float*) dst->data, (float*) src1->data, (const __fp16 *) src0->data,
|
||||
m_total, k, n, act_stride, wgt_stride);
|
||||
return op_matmul_hvx(octx);
|
||||
}
|
||||
} else {
|
||||
ret = hmx_matmul_q_f32(octx->ctx, (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data,
|
||||
m_total, k, n, (int) src0->type);
|
||||
ret = hmx_matmul_2d_f32(octx->ctx, (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data,
|
||||
m_total, k, n, act_stride, (int) src0->nb[1], (int) src0->type);
|
||||
}
|
||||
|
||||
if (ret != 0) {
|
||||
@@ -4539,8 +4808,30 @@ int op_matmul_id(struct htp_ops_context * octx) {
|
||||
|
||||
size_t matrix_row_counts_size = n_as * sizeof(uint32_t);
|
||||
size_t matrix_row_map_size = n_as * ids->ne[0] * ids->ne[1] * sizeof(struct mmid_row_mapping);
|
||||
const size_t total_map_size = matrix_row_counts_size + matrix_row_map_size;
|
||||
|
||||
void * mapping_buf = NULL;
|
||||
bool must_free_mapping = false;
|
||||
|
||||
if (octx->ctx->ddr_spad_base && total_map_size <= octx->ctx->ddr_spad_size) {
|
||||
mapping_buf = octx->ctx->ddr_spad_base;
|
||||
} else {
|
||||
mapping_buf = memalign(128, total_map_size);
|
||||
if (mapping_buf) {
|
||||
must_free_mapping = true;
|
||||
} else {
|
||||
return HTP_STATUS_INTERNAL_ERR;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t * matrix_row_counts = (uint32_t *) mapping_buf;
|
||||
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) ((uint8_t *) mapping_buf + matrix_row_counts_size);
|
||||
|
||||
mmctx->matrix_row_counts = matrix_row_counts;
|
||||
mmctx->matrix_rows = matrix_rows;
|
||||
|
||||
if (htp_mminit_vec_dot(mmctx, src0->type) != 0) {
|
||||
if (must_free_mapping) free(mapping_buf);
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
@@ -4552,7 +4843,7 @@ int op_matmul_id(struct htp_ops_context * octx) {
|
||||
src1_row_size = q8x4x2_row_size(ne10);
|
||||
}
|
||||
|
||||
const size_t src2_spad_size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256);
|
||||
const size_t src2_spad_size_per_thread = 0; // We moved the mapping to DDR!
|
||||
htp_mminit_spad(octx, dst_row_size, src0_row_size_padded, src1_row_size, src1_nrows, src2_spad_size_per_thread);
|
||||
|
||||
size_t spad_size = octx->src2_spad.size + octx->src1_spad.size + octx->src0_spad.size + octx->dst_spad.size;
|
||||
@@ -4568,6 +4859,7 @@ int op_matmul_id(struct htp_ops_context * octx) {
|
||||
// Make sure the reserved vtcm size is sufficient
|
||||
if (octx->ctx->vtcm_size < spad_size) {
|
||||
FARF(ERROR, "matmul-id-%s : current VTCM reservation %zu is too small, needed %zu\n", mmctx->type, octx->ctx->vtcm_size, spad_size);
|
||||
if (must_free_mapping) free(mapping_buf);
|
||||
return HTP_STATUS_VTCM_TOO_SMALL;
|
||||
}
|
||||
|
||||
@@ -4587,9 +4879,6 @@ int op_matmul_id(struct htp_ops_context * octx) {
|
||||
|
||||
if (src1_nrows > 1) {
|
||||
// initialize matrix_row_counts and map
|
||||
uint32_t * matrix_row_counts = (uint32_t *) octx->src2_spad.data + 0;
|
||||
struct mmid_row_mapping * matrix_rows = (void *) octx->src2_spad.data + matrix_row_counts_size;
|
||||
|
||||
memset(matrix_row_counts, 0, n_as * sizeof(uint32_t));
|
||||
|
||||
// group rows by src0 matrix
|
||||
@@ -4599,14 +4888,60 @@ int op_matmul_id(struct htp_ops_context * octx) {
|
||||
|
||||
assert(i02 >= 0 && i02 < n_as);
|
||||
|
||||
MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) { id, iid1 };
|
||||
matrix_rows[i02 * n_ids * ids->ne[1] + matrix_row_counts[i02]] = (struct mmid_row_mapping) { id, iid1 };
|
||||
matrix_row_counts[i02] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
if (must_free_mapping) free(mapping_buf);
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
bool hmx_eligible = false;
|
||||
#ifdef HTP_HAS_HMX
|
||||
if (octx->ctx->hmx_enabled && src1_nrows > 1) {
|
||||
uint32_t wtype = src0->type;
|
||||
if (ne01 % 32 == 0 &&
|
||||
(wtype == HTP_TYPE_F16 || wtype == HTP_TYPE_F32 || wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 || wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL || wtype == HTP_TYPE_MXFP4)) {
|
||||
if ((wtype == HTP_TYPE_F16 || wtype == HTP_TYPE_F32) && ne00 % 32 == 0) {
|
||||
hmx_eligible = true;
|
||||
} else if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32 && ne00 % 256 == 0) {
|
||||
hmx_eligible = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
mmctx->hmx_eligible = hmx_eligible;
|
||||
|
||||
if (hmx_eligible) {
|
||||
for (uint32_t cur_a = 0; cur_a < n_as; ++cur_a) {
|
||||
const int32_t cne1 = matrix_row_counts[cur_a];
|
||||
if (cne1 == 0) continue;
|
||||
|
||||
int ret = hmx_matmul_id_2d_f32(octx->ctx, (float*) dst->data, (float*) src1->data,
|
||||
(const uint8_t *) src0->data + cur_a * nb02,
|
||||
cne1, ne00, ne01,
|
||||
ne11,
|
||||
nb11, nb12,
|
||||
nb1, nb2,
|
||||
(int) src0->nb[1], (int) src0->type,
|
||||
matrix_rows, cur_a, n_ids * ids->ne[1]);
|
||||
if (ret != 0) {
|
||||
FARF(ERROR, "HMX matmul failed for expert %u, error %d\n", cur_a, ret);
|
||||
if (must_free_mapping) free(mapping_buf);
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
}
|
||||
|
||||
// HMX has overwritten VTCM, so force dynamic quantization cache to clear
|
||||
octx->src1_spad.src = NULL;
|
||||
|
||||
if (must_free_mapping) free(mapping_buf);
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
if (octx->src1_spad.src != src1) {
|
||||
const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
|
||||
@@ -4618,5 +4953,6 @@ int op_matmul_id(struct htp_ops_context * octx) {
|
||||
const uint32_t n_matmul_jobs = octx->n_threads;
|
||||
worker_pool_run_func(octx->ctx->worker_pool, matmul_id_job_func, mmctx, n_matmul_jobs);
|
||||
|
||||
if (must_free_mapping) free(mapping_buf);
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
@@ -511,6 +511,8 @@ int op_pad(struct htp_ops_context * octx) {
|
||||
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
octx->src0_spad.src = NULL;
|
||||
octx->dst_spad.src = NULL;
|
||||
}
|
||||
|
||||
struct htp_pad_context pctx = {
|
||||
|
||||
@@ -692,6 +692,11 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
||||
const uint8_t * restrict data_src1 = uctx->data_src1;
|
||||
uint8_t * restrict data_dst = uctx->data_dst;
|
||||
|
||||
const struct htp_tensor * src1 = (htp_op == HTP_OP_RMS_NORM_MUL) ? octx->src[1] : NULL;
|
||||
const uint32_t nb11 = src1 ? src1->nb[1] : 0;
|
||||
const uint32_t nb12 = src1 ? src1->nb[2] : 0;
|
||||
const uint32_t nb13 = src1 ? src1->nb[3] : 0;
|
||||
|
||||
uint8_t * src0_spad_data = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
||||
uint8_t * src1_spad_data = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
|
||||
uint8_t * dst_spad_data = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
||||
@@ -738,10 +743,10 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
||||
src0_row_size_aligned, nb01, src0_data_row_size, block_size);
|
||||
|
||||
if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
|
||||
const size_t src1_off = unary_row_offset(ir, ne01, ne02, nb01, nb02, nb03);
|
||||
const size_t src1_off = unary_row_offset(ir, ne01, ne02, nb11, nb12, nb13);
|
||||
dma_queue_push(dma_queue,
|
||||
dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + src1_off),
|
||||
uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, block_size);
|
||||
uctx->src1_row_size_aligned, nb11, uctx->src1_data_row_size, block_size);
|
||||
}
|
||||
|
||||
ir += block_size;
|
||||
@@ -823,10 +828,10 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
||||
src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size);
|
||||
|
||||
if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
|
||||
const size_t src1_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03);
|
||||
const size_t src1_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb11, nb12, nb13);
|
||||
dma_queue_push(dma_queue,
|
||||
dma_make_ptr(src1_spad, data_src1 + src1_pref_off),
|
||||
uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, pref_block_size);
|
||||
uctx->src1_row_size_aligned, nb11, uctx->src1_data_row_size, pref_block_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -977,6 +982,10 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
||||
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
}
|
||||
|
||||
octx->src0_spad.src = NULL;
|
||||
octx->src1_spad.src = NULL;
|
||||
octx->dst_spad.src = NULL;
|
||||
|
||||
FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
|
||||
|
||||
@@ -1107,7 +1107,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_GLU_OP_SWIGLU_OAI:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
return ggml_is_contiguous_1(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1421,7 +1421,8 @@ template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat
|
||||
template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
|
||||
template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;
|
||||
|
||||
kernel void kernel_reglu_f32(
|
||||
template<typename T>
|
||||
kernel void kernel_reglu(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
@@ -1429,19 +1430,25 @@ kernel void kernel_reglu_f32(
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
dst_row[i0] = x0*x1*(x0 > 0.0f);
|
||||
dst_row[i0] = (T)(x0*x1*(x0 > 0.0f));
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_f32(
|
||||
typedef decltype(kernel_reglu<float>) kernel_reglu_t;
|
||||
|
||||
template [[host_name("kernel_reglu_f32")]] kernel kernel_reglu_t kernel_reglu<float>;
|
||||
template [[host_name("kernel_reglu_f16")]] kernel kernel_reglu_t kernel_reglu<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_geglu(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
@@ -1449,9 +1456,9 @@ kernel void kernel_geglu_f32(
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
@@ -1459,11 +1466,17 @@ kernel void kernel_geglu_f32(
|
||||
|
||||
const float gelu = 0.5f*x0*(1.0f + precise::tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0)));
|
||||
|
||||
dst_row[i0] = gelu*x1;
|
||||
dst_row[i0] = (T)(gelu*x1);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_swiglu_f32(
|
||||
typedef decltype(kernel_geglu<float>) kernel_geglu_t;
|
||||
|
||||
template [[host_name("kernel_geglu_f32")]] kernel kernel_geglu_t kernel_geglu<float>;
|
||||
template [[host_name("kernel_geglu_f16")]] kernel kernel_geglu_t kernel_geglu<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_swiglu(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
@@ -1471,9 +1484,9 @@ kernel void kernel_swiglu_f32(
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
@@ -1481,11 +1494,17 @@ kernel void kernel_swiglu_f32(
|
||||
|
||||
const float silu = x0 / (1.0f + exp(-x0));
|
||||
|
||||
dst_row[i0] = silu*x1;
|
||||
dst_row[i0] = (T)(silu*x1);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_swiglu_oai_f32(
|
||||
typedef decltype(kernel_swiglu<float>) kernel_swiglu_t;
|
||||
|
||||
template [[host_name("kernel_swiglu_f32")]] kernel kernel_swiglu_t kernel_swiglu<float>;
|
||||
template [[host_name("kernel_swiglu_f16")]] kernel kernel_swiglu_t kernel_swiglu<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_swiglu_oai(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
@@ -1493,9 +1512,9 @@ kernel void kernel_swiglu_oai_f32(
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
float x0 = src0_row[i0];
|
||||
@@ -1507,11 +1526,17 @@ kernel void kernel_swiglu_oai_f32(
|
||||
float out_glu = x0 / (1.0f + exp(-x0 * args.alpha));
|
||||
out_glu = out_glu * (1.0f + x1);
|
||||
|
||||
dst_row[i0] = out_glu;
|
||||
dst_row[i0] = (T)out_glu;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_erf_f32(
|
||||
typedef decltype(kernel_swiglu_oai<float>) kernel_swiglu_oai_t;
|
||||
|
||||
template [[host_name("kernel_swiglu_oai_f32")]] kernel kernel_swiglu_oai_t kernel_swiglu_oai<float>;
|
||||
template [[host_name("kernel_swiglu_oai_f16")]] kernel kernel_swiglu_oai_t kernel_swiglu_oai<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_geglu_erf(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
@@ -1519,9 +1544,9 @@ kernel void kernel_geglu_erf_f32(
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
@@ -1529,11 +1554,17 @@ kernel void kernel_geglu_erf_f32(
|
||||
|
||||
const float gelu_erf = 0.5f*x0*(1.0f+erf_approx<float>(x0*SQRT_2_INV));
|
||||
|
||||
dst_row[i0] = gelu_erf*x1;
|
||||
dst_row[i0] = (T)(gelu_erf*x1);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_quick_f32(
|
||||
typedef decltype(kernel_geglu_erf<float>) kernel_geglu_erf_t;
|
||||
|
||||
template [[host_name("kernel_geglu_erf_f32")]] kernel kernel_geglu_erf_t kernel_geglu_erf<float>;
|
||||
template [[host_name("kernel_geglu_erf_f16")]] kernel kernel_geglu_erf_t kernel_geglu_erf<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_geglu_quick(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
@@ -1541,9 +1572,9 @@ kernel void kernel_geglu_quick_f32(
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
@@ -1551,10 +1582,15 @@ kernel void kernel_geglu_quick_f32(
|
||||
|
||||
const float gelu_quick = x0*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x0)));
|
||||
|
||||
dst_row[i0] = gelu_quick*x1;
|
||||
dst_row[i0] = (T)(gelu_quick*x1);
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_geglu_quick<float>) kernel_geglu_quick_t;
|
||||
|
||||
template [[host_name("kernel_geglu_quick_f32")]] kernel kernel_geglu_quick_t kernel_geglu_quick<float>;
|
||||
template [[host_name("kernel_geglu_quick_f16")]] kernel kernel_geglu_quick_t kernel_geglu_quick<half>;
|
||||
|
||||
kernel void kernel_op_sum_f32(
|
||||
constant ggml_metal_kargs_sum & args,
|
||||
device const float * src0,
|
||||
|
||||
@@ -87,6 +87,10 @@ set(GGML_OPENCL_KERNELS
|
||||
mul_mv_q4_1_f32_flat
|
||||
mul_mv_q4_k_f32
|
||||
mul_mv_q4_k_f32_flat
|
||||
mul_mv_q5_0_f32
|
||||
mul_mv_q5_0_f32_flat
|
||||
mul_mv_q5_1_f32
|
||||
mul_mv_q5_1_f32_flat
|
||||
mul_mv_q5_k_f32
|
||||
mul_mv_q5_k_f32_flat
|
||||
mul_mv_q6_k_f32
|
||||
@@ -126,6 +130,8 @@ set(GGML_OPENCL_KERNELS
|
||||
mul_mm_f16_f32_l4_lm
|
||||
mul_mm_q4_0_f32_l4_lm
|
||||
mul_mm_q4_1_f32_l4_lm
|
||||
mul_mm_q5_0_f32_l4_lm
|
||||
mul_mm_q5_1_f32_l4_lm
|
||||
mul_mm_q8_0_f32_l4_lm
|
||||
mul_mm_iq4_nl_f32_l4_lm
|
||||
mul_mm_q4_k_f32_l4_lm
|
||||
|
||||
@@ -380,7 +380,7 @@ struct ggml_backend_opencl_device_context {
|
||||
ADRENO_GPU_GEN adreno_gen = ADRENO_GPU_GEN::ADRENO_UNKNOWN;
|
||||
|
||||
std::regex *opfilter = nullptr; // regex of ops to not claim
|
||||
std::string opfilter_str; // regex string for opfilter
|
||||
std::string opfilter_str = ""; // regex string for opfilter
|
||||
size_t global_mem_size = 0;
|
||||
};
|
||||
|
||||
@@ -576,7 +576,9 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_convert_block_q4_0_trans4_ns, kernel_restore_block_q4_0_trans4_ns;
|
||||
cl_kernel kernel_convert_block_q4_1, kernel_restore_block_q4_1;
|
||||
cl_kernel kernel_convert_block_q4_1_trans4_ns, kernel_restore_block_q4_1_trans4_ns;
|
||||
cl_kernel kernel_convert_block_q5_0, kernel_restore_block_q5_0;
|
||||
cl_kernel kernel_convert_block_q5_0_trans4_ns, kernel_restore_block_q5_0_trans4_ns;
|
||||
cl_kernel kernel_convert_block_q5_1, kernel_restore_block_q5_1;
|
||||
cl_kernel kernel_convert_block_q5_1_trans4_ns, kernel_restore_block_q5_1_trans4_ns;
|
||||
cl_kernel kernel_convert_block_q4_k_trans4_ns, kernel_restore_block_q4_k_trans4_ns;
|
||||
cl_kernel kernel_convert_block_q5_k_trans4_ns, kernel_restore_block_q5_k_trans4_ns;
|
||||
@@ -604,6 +606,10 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
||||
cl_kernel kernel_mul_mv_q4_1_f32;
|
||||
cl_kernel kernel_mul_mv_q4_1_f32_flat;
|
||||
cl_kernel kernel_mul_mv_q5_0_f32;
|
||||
cl_kernel kernel_mul_mv_q5_0_f32_flat;
|
||||
cl_kernel kernel_mul_mv_q5_1_f32;
|
||||
cl_kernel kernel_mul_mv_q5_1_f32_flat;
|
||||
cl_kernel kernel_mul_mv_q4_K_f32;
|
||||
cl_kernel kernel_mul_mv_q4_K_f32_flat;
|
||||
cl_kernel kernel_mul_mv_q5_K_f32;
|
||||
@@ -662,6 +668,8 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_mul_mm_f16_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q4_0_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q4_1_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q5_0_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q5_1_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q8_0_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q4_k_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q5_k_f32_l4_lm;
|
||||
@@ -1141,8 +1149,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_k_trans4_ns", &err), err));
|
||||
@@ -1485,6 +1497,74 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_q5_0_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mv_q5_0_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mv_q5_0_f32.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mv_q5_0_f32 = clCreateKernel(prog, "kernel_mul_mv_q5_0_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_q5_0_f32_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mv_q5_0_f32_flat.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mv_q5_0_f32_flat.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mv_q5_0_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q5_0_f32_flat", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_q5_1_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mv_q5_1_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mv_q5_1_f32.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mv_q5_1_f32 = clCreateKernel(prog, "kernel_mul_mv_q5_1_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_q5_1_f32_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mv_q5_1_f32_flat.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mv_q5_1_f32_flat.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mv_q5_1_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q5_1_f32_flat", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_q5_k_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -1835,6 +1915,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mm_q5_0_f32_l4_lm
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mm_q5_0_f32_l4_lm.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mm_q5_0_f32_l4_lm.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mm_q5_0_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q5_0_f32_l4_lm", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mm_q5_1_f32_l4_lm
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mm_q5_1_f32_l4_lm.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mm_q5_1_f32_l4_lm.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mm_q5_1_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q5_1_f32_l4_lm", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mm_q8_0_f32_l4_lm
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -4838,6 +4950,21 @@ inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backen
|
||||
return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27
|
||||
}
|
||||
|
||||
static inline bool use_flat_gemv_for_large_m_q4_K(const ggml_tensor *tensor) {
|
||||
// gemv_noshuffle variant perf drops for large M, use flat variant for large M.
|
||||
// threshold is well above typical hidden/FFN dims, but below typical vocab sizes.
|
||||
// note that this forces large M weights to use LM GEMM.
|
||||
return tensor->ne[1] >= 32768 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
||||
}
|
||||
|
||||
static inline bool use_flat_gemv_for_large_m_q6_K(const ggml_tensor *tensor) {
|
||||
// gemv_noshuffle variant perf drops for large M, use flat variant for large M.
|
||||
// threshold is well above typical hidden/FFN dims, but below typical vocab sizes.
|
||||
// q6_K flat gemv is worse for smaller K; 2048 seems to be a reasonable threshold.
|
||||
// note that this forces large M weights to use LM GEMM.
|
||||
return tensor->ne[1] >= 32768 && tensor->ne[0] >= 2048 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
||||
}
|
||||
|
||||
static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
|
||||
ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
|
||||
@@ -5027,6 +5154,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
} else if (op->src[0]->type == GGML_TYPE_F32) {
|
||||
return op->src[1]->type == GGML_TYPE_F32;
|
||||
} else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q4_1 ||
|
||||
op->src[0]->type == GGML_TYPE_Q5_0 || op->src[0]->type == GGML_TYPE_Q5_1 ||
|
||||
op->src[0]->type == GGML_TYPE_MXFP4 ||
|
||||
op->src[0]->type == GGML_TYPE_IQ4_NL ||
|
||||
op->src[0]->type == GGML_TYPE_Q4_K ||
|
||||
@@ -5977,7 +6105,24 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
return;
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0;
|
||||
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &n_blk));
|
||||
|
||||
size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64) * 64, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
tensor->extra = extra;
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q5_1) {
|
||||
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
||||
@@ -6078,6 +6223,24 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_1;
|
||||
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->m));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &n_blk));
|
||||
|
||||
size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64) * 64, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
tensor->extra = extra;
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_MXFP4) {
|
||||
@@ -6447,7 +6610,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor)) {
|
||||
kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle;
|
||||
}
|
||||
#else
|
||||
@@ -6475,7 +6638,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
|
||||
tensor->extra = extra;
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor)) {
|
||||
|
||||
int M = tensor->ne[1];
|
||||
int K = tensor->ne[0];
|
||||
@@ -6674,9 +6837,6 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
|
||||
cl_buffer_region region;
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Adreno MoE Q6_K kernel needs special transposed layout
|
||||
if (use_adreno_moe_kernels(backend_ctx, tensor)) {
|
||||
@@ -6710,6 +6870,9 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q6_k_trans4_ns;
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
int ne00 = tensor->ne[0];
|
||||
int ne01 = tensor->ne[1];
|
||||
int ne02 = tensor->ne[2];
|
||||
@@ -6775,7 +6938,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
cl_kernel kernel;
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
kernel = backend_ctx->kernel_convert_block_q6_K;
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor)) {
|
||||
kernel = backend_ctx->kernel_convert_block_q6_K_noshuffle;
|
||||
}
|
||||
#else
|
||||
@@ -6808,7 +6971,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
tensor->extra = extra;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor)) {
|
||||
cl_int M = tensor->ne[1]; // ne01
|
||||
cl_int K = tensor->ne[0]; // ne00
|
||||
|
||||
@@ -6846,7 +7009,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
size, (void *) data, &err);
|
||||
size, const_cast<void *>(data), &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_bf16_to_f16;
|
||||
@@ -7135,8 +7298,29 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// TODO: normal q5_0
|
||||
(void) extra;
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q5_0;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &data_device));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clEnqueueReadBuffer(
|
||||
queue, data_device, CL_TRUE, offset,
|
||||
size, data, 0, NULL, NULL));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q5_1) {
|
||||
@@ -7177,8 +7361,29 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// TODO: normal q5_1
|
||||
(void) extra;
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q5_1;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->m));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clEnqueueReadBuffer(
|
||||
queue, data_device, CL_TRUE, offset,
|
||||
size, data, 0, NULL, NULL));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_MXFP4) {
|
||||
@@ -7409,7 +7614,7 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor)) {
|
||||
int M = tensor->ne[1];
|
||||
int K = tensor->ne[0];
|
||||
|
||||
@@ -7592,9 +7797,6 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
if (tensor->type == GGML_TYPE_Q6_K) {
|
||||
ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra;
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_moe_kernels(backend_ctx, tensor)) {
|
||||
cl_int err;
|
||||
@@ -7604,6 +7806,9 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q6_k_trans4_ns;
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
int ne00 = tensor->ne[0];
|
||||
int ne01 = tensor->ne[1];
|
||||
int ne02 = tensor->ne[2];
|
||||
@@ -7630,7 +7835,7 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor)) {
|
||||
static ggml_cl_buffer buf_trans_ql;
|
||||
static ggml_cl_buffer buf_trans_qh;
|
||||
static ggml_cl_buffer buf_trans_s;
|
||||
@@ -12936,6 +13141,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
|
||||
ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra;
|
||||
ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)src0->extra;
|
||||
ggml_tensor_extra_cl_q5_1 * extra0_q5_1 = (ggml_tensor_extra_cl_q5_1 *)src0->extra;
|
||||
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
|
||||
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
|
||||
ggml_tensor_extra_cl_iq4_nl * extra0_iq4_nl = (ggml_tensor_extra_cl_iq4_nl *)src0->extra;
|
||||
@@ -13021,13 +13228,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
}
|
||||
|
||||
// q4_k x fp32
|
||||
if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) {
|
||||
if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32 && !use_flat_gemv_for_large_m_q4_K(src0)) {
|
||||
ggml_cl_mul_mat_q4_k_f32_adreno(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// q6_K x fp32
|
||||
if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) {
|
||||
if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32 && !use_flat_gemv_for_large_m_q6_K(src0)) {
|
||||
ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
@@ -13271,6 +13478,93 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
return;
|
||||
}
|
||||
case GGML_TYPE_Q5_0: {
|
||||
if (ne11 < 32) {
|
||||
break;
|
||||
}
|
||||
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
|
||||
break;
|
||||
}
|
||||
|
||||
kernel = backend_ctx->kernel_mul_mm_q5_0_f32_l4_lm;
|
||||
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
|
||||
|
||||
int batch_stride_a = ne00*ne01;
|
||||
int batch_stride_b = ne10*ne11;
|
||||
int batch_stride_d = ne0*ne1;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_0->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_0->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_a
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); // stride_b
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne01)); // stride_d
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_a));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_b));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r3));
|
||||
|
||||
// 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
|
||||
size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
|
||||
size_t local_work_size[] = {(size_t)nth0, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
return;
|
||||
}
|
||||
case GGML_TYPE_Q5_1: {
|
||||
if (ne11 < 32) {
|
||||
break;
|
||||
}
|
||||
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
|
||||
break;
|
||||
}
|
||||
|
||||
kernel = backend_ctx->kernel_mul_mm_q5_1_f32_l4_lm;
|
||||
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
|
||||
|
||||
int batch_stride_a = ne00*ne01;
|
||||
int batch_stride_b = ne10*ne11;
|
||||
int batch_stride_d = ne0*ne1;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_1->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_1->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_1->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_1->m));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); // stride_a
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); // stride_b
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne01)); // stride_d
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_a));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_b));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &batch_stride_d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r3));
|
||||
|
||||
// 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
|
||||
size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
|
||||
size_t local_work_size[] = {(size_t)nth0, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
return;
|
||||
}
|
||||
case GGML_TYPE_Q8_0: {
|
||||
if (ne11 < 32) {
|
||||
break;
|
||||
@@ -13807,6 +14101,137 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
#endif // GGML_OPENCL_SOA_Q
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_Q5_0: {
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
nth0 = 16;
|
||||
nth1 = 1;
|
||||
ndst = 4;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
nth0 = 64;
|
||||
nth1 = 1;
|
||||
ndst = 4;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
|
||||
kernel = backend_ctx->kernel_mul_mv_q5_0_f32_flat;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_0->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_0->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &r3));
|
||||
#else
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
nth0 = 16;
|
||||
nth1 = 1;
|
||||
ndst = 4;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
nth0 = 64;
|
||||
nth1 = 1;
|
||||
ndst = 4;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
|
||||
kernel = backend_ctx->kernel_mul_mv_q5_0_f32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
|
||||
#endif // GGML_OPENCL_SOA_Q
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_Q5_1: {
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
nth0 = 16;
|
||||
nth1 = 1;
|
||||
ndst = 4;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
nth0 = 64;
|
||||
nth1 = 1;
|
||||
ndst = 4;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
|
||||
kernel = backend_ctx->kernel_mul_mv_q5_1_f32_flat;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_1->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_1->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_1->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_1->m));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r3));
|
||||
#else
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
nth0 = 16;
|
||||
nth1 = 1;
|
||||
ndst = 4;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
nth0 = 64;
|
||||
nth1 = 1;
|
||||
ndst = 4;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
|
||||
kernel = backend_ctx->kernel_mul_mv_q5_1_f32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
|
||||
#endif // GGML_OPENCL_SOA_Q
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_Q8_0: {
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
|
||||
@@ -14247,6 +14672,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_MXFP4 ||
|
||||
src0t == GGML_TYPE_Q4_1 ||
|
||||
src0t == GGML_TYPE_Q5_0 ||
|
||||
src0t == GGML_TYPE_Q5_1 ||
|
||||
src0t == GGML_TYPE_Q8_0 ||
|
||||
src0t == GGML_TYPE_IQ4_NL ||
|
||||
src0t == GGML_TYPE_Q2_K) {
|
||||
@@ -14476,6 +14903,8 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
const int ne1 = dst->ne[1];
|
||||
const int ne2 = dst->ne[2];
|
||||
|
||||
GGML_UNUSED(ne2);
|
||||
|
||||
const int r2 = ne12/ne02;
|
||||
const int r3 = ne13/ne03;
|
||||
const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
|
||||
@@ -14490,6 +14919,8 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
const int n_tile_size = 32;
|
||||
const int max_post_router_tile = (ne20 * ne21 / n_tile_size) + ne02;
|
||||
|
||||
GGML_UNUSED(max_post_router_tile);
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
// subgroup mat vec
|
||||
|
||||
@@ -537,6 +537,53 @@ kernel void kernel_restore_block_q4_1_trans4_ns(
|
||||
((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kernel_convert_block_q5_0
|
||||
// Convert the block_q5_0 format to 3 separate arrays (AOS -> SOA).
|
||||
// This kernel does not deshuffle the bits.
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_convert_block_q5_0(
|
||||
global struct block_q5_0 * src0,
|
||||
global uchar * dst_qs,
|
||||
global uint * dst_qh,
|
||||
global half * dst_d,
|
||||
ulong n_blk
|
||||
) {
|
||||
if (get_global_id(0) >= n_blk) {
|
||||
return;
|
||||
}
|
||||
|
||||
global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0);
|
||||
global uchar * qs = (global uchar *) dst_qs + (QK5_0/2)*get_global_id(0);
|
||||
global uint * qh = (global uint *) dst_qh + get_global_id(0);
|
||||
global half * d = (global half *) dst_d + get_global_id(0);
|
||||
|
||||
*d = b->d;
|
||||
*qh = *((global uint *)(b->qh));
|
||||
|
||||
for (int i = 0; i < QK5_0/2; ++i) {
|
||||
qs[i] = b->qs[i];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q5_0(
|
||||
global uchar * src_qs,
|
||||
global uint * src_qh,
|
||||
global half * src_d,
|
||||
global struct block_q5_0 * dst
|
||||
) {
|
||||
global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0);
|
||||
global uchar * qs = (global uchar *) src_qs + (QK5_0/2)*get_global_id(0);
|
||||
global uint * qh = (global uint *) src_qh + get_global_id(0);
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
|
||||
b->d = *d;
|
||||
*((global uint *)(b->qh)) = *qh;
|
||||
for (int i = 0; i < QK5_0/2; ++i) {
|
||||
b->qs[i] = qs[i];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_0_trans4_ns(
|
||||
__global struct block_q5_0 * src0,
|
||||
__global uint * dst_qs,
|
||||
@@ -636,6 +683,59 @@ kernel void kernel_restore_block_q5_0_trans4_ns(
|
||||
((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kernel_convert_block_q5_1
|
||||
// Convert the block_q5_1 format to 4 separate arrays (AOS -> SOA).
|
||||
// This kernel does not deshuffle the bits.
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_convert_block_q5_1(
|
||||
global struct block_q5_1 * src0,
|
||||
global uchar * dst_qs,
|
||||
global uint * dst_qh,
|
||||
global half * dst_d,
|
||||
global half * dst_m,
|
||||
ulong n_blk
|
||||
) {
|
||||
if (get_global_id(0) >= n_blk) {
|
||||
return;
|
||||
}
|
||||
|
||||
global struct block_q5_1 * b = (global struct block_q5_1 *) src0 + get_global_id(0);
|
||||
global uchar * qs = (global uchar *) dst_qs + (QK5_1/2)*get_global_id(0);
|
||||
global uint * qh = (global uint *) dst_qh + get_global_id(0);
|
||||
global half * d = (global half *) dst_d + get_global_id(0);
|
||||
global half * m = (global half *) dst_m + get_global_id(0);
|
||||
|
||||
*d = b->d;
|
||||
*m = b->m;
|
||||
*qh = *((global uint *)(b->qh));
|
||||
|
||||
for (int i = 0; i < QK5_1/2; ++i) {
|
||||
qs[i] = b->qs[i];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q5_1(
|
||||
global uchar * src_qs,
|
||||
global uint * src_qh,
|
||||
global half * src_d,
|
||||
global half * src_m,
|
||||
global struct block_q5_1 * dst
|
||||
) {
|
||||
global struct block_q5_1 * b = (global struct block_q5_1 *) dst + get_global_id(0);
|
||||
global uchar * qs = (global uchar *) src_qs + (QK5_1/2)*get_global_id(0);
|
||||
global uint * qh = (global uint *) src_qh + get_global_id(0);
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
global half * m = (global half *) src_m + get_global_id(0);
|
||||
|
||||
b->d = *d;
|
||||
b->m = *m;
|
||||
*((global uint *)(b->qh)) = *qh;
|
||||
for (int i = 0; i < QK5_1/2; ++i) {
|
||||
b->qs[i] = qs[i];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_1_trans4_ns(
|
||||
__global struct block_q5_1 * src0,
|
||||
__global uint * dst_qs,
|
||||
|
||||
@@ -0,0 +1,173 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#define LOAD_VEC_A 8
|
||||
#define LOAD_VEC_B 4
|
||||
|
||||
#define BM 64
|
||||
#define BN 64
|
||||
#define BK 32
|
||||
#define TM 4
|
||||
#define TN 8
|
||||
|
||||
kernel void kernel_mul_mm_q5_0_f32_l4_lm(
|
||||
global uchar4 * src0_qs,
|
||||
global uint * src0_qh,
|
||||
global half * src0_d,
|
||||
global float4 * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne11,
|
||||
int ne12,
|
||||
|
||||
int stride_a,
|
||||
int stride_b,
|
||||
int stride_d,
|
||||
|
||||
int batch_stride_a,
|
||||
int batch_stride_b,
|
||||
int batch_stride_d,
|
||||
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src1 = (global float4*)((global char*)src1 + offset1);
|
||||
dst = (global float *)((global char*)dst + offsetd);
|
||||
|
||||
local float buf_a[BM * BK];
|
||||
local float buf_b[BN * BK];
|
||||
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
const int i13 = batch_idx / ne12;
|
||||
const int i12 = batch_idx % ne12;
|
||||
|
||||
const int i03 = i13 / r3;
|
||||
const int i02 = i12 / r2;
|
||||
|
||||
const int batch_idx_a = i03 * ne02 + i02;
|
||||
|
||||
const int ir = get_group_id(0);
|
||||
const int ic = get_group_id(1);
|
||||
|
||||
const int tid = get_local_id(0);
|
||||
const int th_r = tid % (BM / TM);
|
||||
const int th_c = tid / (BM / TM);
|
||||
|
||||
const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
|
||||
const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
|
||||
const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
|
||||
const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
|
||||
|
||||
const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
|
||||
const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
|
||||
|
||||
int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
|
||||
int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
|
||||
|
||||
float sums[TM * TN];
|
||||
float cache_a[TM];
|
||||
float cache_b[TN];
|
||||
|
||||
for (int i = 0; i < TM * TN; i++) {
|
||||
sums[i] = 0.0f;
|
||||
}
|
||||
|
||||
for (int block = 0; block < ne00; block += BK) {
|
||||
for (int l = 0; l < BM; l += loadstride_a) {
|
||||
if (ir*BM + loadc_a + l < ne01) {
|
||||
int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
|
||||
int ib = idx / 4;
|
||||
int iqs = idx % 4;
|
||||
|
||||
float d = (float)src0_d[ib];
|
||||
uint qh_val = src0_qh[ib];
|
||||
|
||||
global uchar4 * qs_ptr = src0_qs + ib*4 + iqs;
|
||||
uchar4 q = *qs_ptr;
|
||||
|
||||
uint qh_lo = qh_val >> (iqs * 4);
|
||||
uint qh_hi = qh_val >> (iqs * 4 + 16);
|
||||
|
||||
uchar4 b_lo = (uchar4)((uchar)qh_lo, (uchar)(qh_lo >> 1), (uchar)(qh_lo >> 2), (uchar)(qh_lo >> 3)) & (uchar)1;
|
||||
uchar4 b_hi = (uchar4)((uchar)qh_hi, (uchar)(qh_hi >> 1), (uchar)(qh_hi >> 2), (uchar)(qh_hi >> 3)) & (uchar)1;
|
||||
|
||||
float4 v1 = (convert_float4((q & (uchar)0x0F) | (b_lo << (uchar)4)) - 16.0f) * d;
|
||||
float4 v2 = (convert_float4((q >> (uchar)4) | (b_hi << (uchar)4)) - 16.0f) * d;
|
||||
|
||||
buf_a[(loadr_a * 4 + 0) * BM + loadc_a + l] = v1.s0;
|
||||
buf_a[(loadr_a * 4 + 1) * BM + loadc_a + l] = v1.s1;
|
||||
buf_a[(loadr_a * 4 + 2) * BM + loadc_a + l] = v1.s2;
|
||||
buf_a[(loadr_a * 4 + 3) * BM + loadc_a + l] = v1.s3;
|
||||
buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = v2.s0;
|
||||
buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = v2.s1;
|
||||
buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = v2.s2;
|
||||
buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = v2.s3;
|
||||
} else {
|
||||
buf_a[(loadr_a * 4 + 0) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 1) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 2) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 3) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
for (int l = 0; l < BN; l += loadstride_b) {
|
||||
if (ic*BN + loadc_b + l < ne11) {
|
||||
int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
|
||||
} else {
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
pos_a += BK / LOAD_VEC_A;
|
||||
pos_b += BK / LOAD_VEC_B;
|
||||
|
||||
for (int i = 0; i < BK; i++) {
|
||||
for (int j = 0; j < TM; j++) {
|
||||
cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
|
||||
}
|
||||
|
||||
for (int j = 0; j < TN; j++) {
|
||||
cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
|
||||
}
|
||||
|
||||
for (int cc = 0; cc < TN; cc++) {
|
||||
for (int cr = 0; cr < TM; cr++) {
|
||||
const int sums_idx = cc*TM + cr;
|
||||
sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int dr = ir * BM + th_r * TM;
|
||||
const int dc = ic * BN + th_c * TN;
|
||||
|
||||
const int offsets = batch_idx * batch_stride_d;
|
||||
|
||||
for (int cc = 0; cc < TN; cc++) {
|
||||
for (int cr = 0; cr < TM; cr++) {
|
||||
if (dr + cr < ne01 && dc + cc < ne11) {
|
||||
dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,175 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#define LOAD_VEC_A 8
|
||||
#define LOAD_VEC_B 4
|
||||
|
||||
#define BM 64
|
||||
#define BN 64
|
||||
#define BK 32
|
||||
#define TM 4
|
||||
#define TN 8
|
||||
|
||||
kernel void kernel_mul_mm_q5_1_f32_l4_lm(
|
||||
global uchar4 * src0_qs,
|
||||
global uint * src0_qh,
|
||||
global half * src0_d,
|
||||
global half * src0_m,
|
||||
global float4 * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne11,
|
||||
int ne12,
|
||||
|
||||
int stride_a,
|
||||
int stride_b,
|
||||
int stride_d,
|
||||
|
||||
int batch_stride_a,
|
||||
int batch_stride_b,
|
||||
int batch_stride_d,
|
||||
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src1 = (global float4*)((global char*)src1 + offset1);
|
||||
dst = (global float *)((global char*)dst + offsetd);
|
||||
|
||||
local float buf_a[BM * BK];
|
||||
local float buf_b[BN * BK];
|
||||
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
const int i13 = batch_idx / ne12;
|
||||
const int i12 = batch_idx % ne12;
|
||||
|
||||
const int i03 = i13 / r3;
|
||||
const int i02 = i12 / r2;
|
||||
|
||||
const int batch_idx_a = i03 * ne02 + i02;
|
||||
|
||||
const int ir = get_group_id(0);
|
||||
const int ic = get_group_id(1);
|
||||
|
||||
const int tid = get_local_id(0);
|
||||
const int th_r = tid % (BM / TM);
|
||||
const int th_c = tid / (BM / TM);
|
||||
|
||||
const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
|
||||
const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
|
||||
const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
|
||||
const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
|
||||
|
||||
const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
|
||||
const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
|
||||
|
||||
int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
|
||||
int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
|
||||
|
||||
float sums[TM * TN];
|
||||
float cache_a[TM];
|
||||
float cache_b[TN];
|
||||
|
||||
for (int i = 0; i < TM * TN; i++) {
|
||||
sums[i] = 0.0f;
|
||||
}
|
||||
|
||||
for (int block = 0; block < ne00; block += BK) {
|
||||
for (int l = 0; l < BM; l += loadstride_a) {
|
||||
if (ir*BM + loadc_a + l < ne01) {
|
||||
int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
|
||||
int ib = idx / 4;
|
||||
int iqs = idx % 4;
|
||||
|
||||
float d = (float)src0_d[ib];
|
||||
float m = (float)src0_m[ib];
|
||||
uint qh_val = src0_qh[ib];
|
||||
|
||||
global uchar4 * qs = src0_qs + ib*4 + iqs;
|
||||
uchar4 q = *qs;
|
||||
|
||||
uint qh_lo = qh_val >> (iqs * 4);
|
||||
uint qh_hi = qh_val >> (iqs * 4 + 16);
|
||||
|
||||
uchar4 b_lo = (uchar4)((uchar)qh_lo, (uchar)(qh_lo >> 1), (uchar)(qh_lo >> 2), (uchar)(qh_lo >> 3)) & (uchar)1;
|
||||
uchar4 b_hi = (uchar4)((uchar)qh_hi, (uchar)(qh_hi >> 1), (uchar)(qh_hi >> 2), (uchar)(qh_hi >> 3)) & (uchar)1;
|
||||
|
||||
float4 v1 = convert_float4((q & (uchar)0x0F) | (b_lo << (uchar)4)) * d + m;
|
||||
float4 v2 = convert_float4((q >> (uchar)4) | (b_hi << (uchar)4)) * d + m;
|
||||
|
||||
buf_a[(loadr_a * 4 + 0) * BM + loadc_a + l] = v1.s0;
|
||||
buf_a[(loadr_a * 4 + 1) * BM + loadc_a + l] = v1.s1;
|
||||
buf_a[(loadr_a * 4 + 2) * BM + loadc_a + l] = v1.s2;
|
||||
buf_a[(loadr_a * 4 + 3) * BM + loadc_a + l] = v1.s3;
|
||||
buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = v2.s0;
|
||||
buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = v2.s1;
|
||||
buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = v2.s2;
|
||||
buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = v2.s3;
|
||||
} else {
|
||||
buf_a[(loadr_a * 4 + 0) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 1) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 2) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 3) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
for (int l = 0; l < BN; l += loadstride_b) {
|
||||
if (ic*BN + loadc_b + l < ne11) {
|
||||
int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
|
||||
} else {
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
pos_a += BK / LOAD_VEC_A;
|
||||
pos_b += BK / LOAD_VEC_B;
|
||||
|
||||
for (int i = 0; i < BK; i++) {
|
||||
for (int j = 0; j < TM; j++) {
|
||||
cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
|
||||
}
|
||||
|
||||
for (int j = 0; j < TN; j++) {
|
||||
cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
|
||||
}
|
||||
|
||||
for (int cc = 0; cc < TN; cc++) {
|
||||
for (int cr = 0; cr < TM; cr++) {
|
||||
const int sums_idx = cc*TM + cr;
|
||||
sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int dr = ir * BM + th_r * TM;
|
||||
const int dc = ic * BN + th_c * TN;
|
||||
|
||||
const int offsets = batch_idx * batch_stride_d;
|
||||
|
||||
for (int cc = 0; cc < TN; cc++) {
|
||||
for (int cr = 0; cr < TM; cr++) {
|
||||
if (dr + cr < ne01 && dc + cc < ne11) {
|
||||
dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,241 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define QK5_0 32
|
||||
|
||||
struct block_q5_0 {
|
||||
half d;
|
||||
uchar qh[4];
|
||||
uchar qs[QK5_0 / 2];
|
||||
};
|
||||
|
||||
inline float block_q5_0_dot_y(
|
||||
global const struct block_q5_0 * qb_curr,
|
||||
float sumy,
|
||||
float16 yl,
|
||||
int il,
|
||||
global const float * yb
|
||||
) {
|
||||
float d = qb_curr->d;
|
||||
|
||||
float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
global const ushort * qs = ((global const ushort *)((global const uchar *) qb_curr + 6 + il));
|
||||
|
||||
acc.s0 += yl.s0 * (qs[0] & 0x000F);
|
||||
acc.s0 += yl.s1 * (qs[0] & 0x0F00);
|
||||
acc.s0 += yl.s8 * (qs[0] & 0x00F0);
|
||||
acc.s3 += yl.s9 * (qs[0] & 0xF000);
|
||||
|
||||
acc.s0 += yl.s2 * (qs[1] & 0x000F);
|
||||
acc.s1 += yl.s3 * (qs[1] & 0x0F00);
|
||||
acc.s2 += yl.sa * (qs[1] & 0x00F0);
|
||||
acc.s3 += yl.sb * (qs[1] & 0xF000);
|
||||
|
||||
acc.s0 += yl.s4 * (qs[2] & 0x000F);
|
||||
acc.s1 += yl.s5 * (qs[2] & 0x0F00);
|
||||
acc.s2 += yl.sc * (qs[2] & 0x00F0);
|
||||
acc.s3 += yl.sd * (qs[2] & 0xF000);
|
||||
|
||||
acc.s0 += yl.s6 * (qs[3] & 0x000F);
|
||||
acc.s1 += yl.s7 * (qs[3] & 0x0F00);
|
||||
acc.s2 += yl.se * (qs[3] & 0x00F0);
|
||||
acc.s3 += yl.sf * (qs[3] & 0xF000);
|
||||
|
||||
uint qh_val = *((global const uint *)((global const uchar *) qb_curr + 2));
|
||||
uchar qh_lo = (uchar)((qh_val >> il) & 0xFF);
|
||||
uchar qh_hi = (uchar)((qh_val >> (il + 16)) & 0xFF);
|
||||
|
||||
float qh_sum = 0.0f;
|
||||
qh_sum += yb[0] * (float)((qh_lo >> 0) & 1);
|
||||
qh_sum += yb[1] * (float)((qh_lo >> 1) & 1);
|
||||
qh_sum += yb[2] * (float)((qh_lo >> 2) & 1);
|
||||
qh_sum += yb[3] * (float)((qh_lo >> 3) & 1);
|
||||
qh_sum += yb[4] * (float)((qh_lo >> 4) & 1);
|
||||
qh_sum += yb[5] * (float)((qh_lo >> 5) & 1);
|
||||
qh_sum += yb[6] * (float)((qh_lo >> 6) & 1);
|
||||
qh_sum += yb[7] * (float)((qh_lo >> 7) & 1);
|
||||
qh_sum += yb[16] * (float)((qh_hi >> 0) & 1);
|
||||
qh_sum += yb[17] * (float)((qh_hi >> 1) & 1);
|
||||
qh_sum += yb[18] * (float)((qh_hi >> 2) & 1);
|
||||
qh_sum += yb[19] * (float)((qh_hi >> 3) & 1);
|
||||
qh_sum += yb[20] * (float)((qh_hi >> 4) & 1);
|
||||
qh_sum += yb[21] * (float)((qh_hi >> 5) & 1);
|
||||
qh_sum += yb[22] * (float)((qh_hi >> 6) & 1);
|
||||
qh_sum += yb[23] * (float)((qh_hi >> 7) & 1);
|
||||
|
||||
return d * (acc.s0 + acc.s1 + acc.s2 + acc.s3 + 16.0f * qh_sum - 16.0f * sumy);
|
||||
}
|
||||
|
||||
#undef N_DST
|
||||
#undef N_SIMDGROUP
|
||||
#undef N_SIMDWIDTH
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
#define N_DST 4 // each subgroup works on 4 rows
|
||||
#define N_SIMDGROUP 1 // number of subgroups in a thread group
|
||||
#define N_SIMDWIDTH 16 // assuming subgroup size is 16
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_DST 4
|
||||
#define N_SIMDGROUP 1
|
||||
#define N_SIMDWIDTH 64
|
||||
#endif
|
||||
|
||||
inline void mul_vec_q_n_f32(
|
||||
global void * src0,
|
||||
global float * src1,
|
||||
global float * dst,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
const ulong nb = ne00/QK5_0;
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int r1 = get_group_id(1);
|
||||
int im = get_group_id(2);
|
||||
|
||||
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
global struct block_q5_0 * x = (global struct block_q5_0 *) src0 + offset0;
|
||||
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float16 yl;
|
||||
float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
|
||||
|
||||
int ix = get_sub_group_local_id()/2;
|
||||
int il = 8*(get_sub_group_local_id()%2);
|
||||
|
||||
global float * yb = y + ix * QK5_0 + il;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
|
||||
float sumy = 0;
|
||||
|
||||
sumy += yb[0];
|
||||
sumy += yb[1];
|
||||
sumy += yb[2];
|
||||
sumy += yb[3];
|
||||
sumy += yb[4];
|
||||
sumy += yb[5];
|
||||
sumy += yb[6];
|
||||
sumy += yb[7];
|
||||
|
||||
sumy += yb[16];
|
||||
sumy += yb[17];
|
||||
sumy += yb[18];
|
||||
sumy += yb[19];
|
||||
sumy += yb[20];
|
||||
sumy += yb[21];
|
||||
sumy += yb[22];
|
||||
sumy += yb[23];
|
||||
|
||||
|
||||
yl.s0 = yb[0];
|
||||
yl.s1 = yb[1]/256.f;
|
||||
|
||||
yl.s2 = yb[2];
|
||||
yl.s3 = yb[3]/256.f;
|
||||
|
||||
yl.s4 = yb[4];
|
||||
yl.s5 = yb[5]/256.f;
|
||||
|
||||
yl.s6 = yb[6];
|
||||
yl.s7 = yb[7]/256.f;
|
||||
|
||||
yl.s8 = yb[16]/16.f;
|
||||
yl.s9 = yb[17]/4096.f;
|
||||
|
||||
yl.sa = yb[18]/16.f;
|
||||
yl.sb = yb[19]/4096.f;
|
||||
|
||||
yl.sc = yb[20]/16.f;
|
||||
yl.sd = yb[21]/4096.f;
|
||||
|
||||
yl.se = yb[22]/16.f;
|
||||
yl.sf = yb[23]/4096.f;
|
||||
|
||||
sumf.s0 += block_q5_0_dot_y(x+ib+0*nb, sumy, yl, il, yb);
|
||||
sumf.s1 += block_q5_0_dot_y(x+ib+1*nb, sumy, yl, il, yb);
|
||||
sumf.s2 += block_q5_0_dot_y(x+ib+2*nb, sumy, yl, il, yb);
|
||||
sumf.s3 += block_q5_0_dot_y(x+ib+3*nb, sumy, yl, il, yb);
|
||||
|
||||
yb += QK5_0 * (N_SIMDWIDTH/2);
|
||||
}
|
||||
|
||||
float4 tot = (float4)(
|
||||
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
|
||||
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
|
||||
);
|
||||
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
if (first_row + 0 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
|
||||
}
|
||||
if (first_row + 1 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
|
||||
}
|
||||
if (first_row + 2 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
|
||||
}
|
||||
if (first_row + 3 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_16
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mv_q5_0_f32(
|
||||
global void * src0,
|
||||
ulong offset0,
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src0 = (global void*)((global char*)src0 + offset0);
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
|
||||
}
|
||||
@@ -0,0 +1,243 @@
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define QK5_0 32
|
||||
|
||||
inline float block_q5_0_dot_y_flat(
|
||||
global const uchar * x,
|
||||
global const uint * qh_ptr,
|
||||
global const half * dh,
|
||||
float sumy,
|
||||
float16 yl,
|
||||
int il,
|
||||
global const float * yb
|
||||
) {
|
||||
float d = *dh;
|
||||
global const ushort * qs = ((global const ushort *)(x + il));
|
||||
|
||||
float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
acc.s0 += yl.s0 * (qs[0] & 0x000F);
|
||||
acc.s0 += yl.s1 * (qs[0] & 0x0F00);
|
||||
acc.s0 += yl.s8 * (qs[0] & 0x00F0);
|
||||
acc.s3 += yl.s9 * (qs[0] & 0xF000);
|
||||
|
||||
acc.s0 += yl.s2 * (qs[1] & 0x000F);
|
||||
acc.s1 += yl.s3 * (qs[1] & 0x0F00);
|
||||
acc.s2 += yl.sa * (qs[1] & 0x00F0);
|
||||
acc.s3 += yl.sb * (qs[1] & 0xF000);
|
||||
|
||||
acc.s0 += yl.s4 * (qs[2] & 0x000F);
|
||||
acc.s1 += yl.s5 * (qs[2] & 0x0F00);
|
||||
acc.s2 += yl.sc * (qs[2] & 0x00F0);
|
||||
acc.s3 += yl.sd * (qs[2] & 0xF000);
|
||||
|
||||
acc.s0 += yl.s6 * (qs[3] & 0x000F);
|
||||
acc.s1 += yl.s7 * (qs[3] & 0x0F00);
|
||||
acc.s2 += yl.se * (qs[3] & 0x00F0);
|
||||
acc.s3 += yl.sf * (qs[3] & 0xF000);
|
||||
|
||||
uint qh_val = *qh_ptr;
|
||||
uchar qh_lo = (uchar)((qh_val >> il) & 0xFF);
|
||||
uchar qh_hi = (uchar)((qh_val >> (il + 16)) & 0xFF);
|
||||
|
||||
float qh_sum = 0.0f;
|
||||
qh_sum += yb[0] * (float)((qh_lo >> 0) & 1);
|
||||
qh_sum += yb[1] * (float)((qh_lo >> 1) & 1);
|
||||
qh_sum += yb[2] * (float)((qh_lo >> 2) & 1);
|
||||
qh_sum += yb[3] * (float)((qh_lo >> 3) & 1);
|
||||
qh_sum += yb[4] * (float)((qh_lo >> 4) & 1);
|
||||
qh_sum += yb[5] * (float)((qh_lo >> 5) & 1);
|
||||
qh_sum += yb[6] * (float)((qh_lo >> 6) & 1);
|
||||
qh_sum += yb[7] * (float)((qh_lo >> 7) & 1);
|
||||
qh_sum += yb[16] * (float)((qh_hi >> 0) & 1);
|
||||
qh_sum += yb[17] * (float)((qh_hi >> 1) & 1);
|
||||
qh_sum += yb[18] * (float)((qh_hi >> 2) & 1);
|
||||
qh_sum += yb[19] * (float)((qh_hi >> 3) & 1);
|
||||
qh_sum += yb[20] * (float)((qh_hi >> 4) & 1);
|
||||
qh_sum += yb[21] * (float)((qh_hi >> 5) & 1);
|
||||
qh_sum += yb[22] * (float)((qh_hi >> 6) & 1);
|
||||
qh_sum += yb[23] * (float)((qh_hi >> 7) & 1);
|
||||
|
||||
return d * (acc.s0 + acc.s1 + acc.s2 + acc.s3 + 16.0f * qh_sum - 16.0f * sumy);
|
||||
}
|
||||
|
||||
#undef N_DST
|
||||
#undef N_SIMDGROUP
|
||||
#undef N_SIMDWIDTH
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
#define N_DST 4 // each subgroup works on 4 rows
|
||||
#define N_SIMDGROUP 1 // number of subgroups in a thread group
|
||||
#define N_SIMDWIDTH 16 // assuming subgroup size is 16
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_DST 4
|
||||
#define N_SIMDGROUP 1
|
||||
#define N_SIMDWIDTH 64
|
||||
#endif
|
||||
|
||||
inline void mul_vec_q_n_f32_flat(
|
||||
global void * src0_qs,
|
||||
global void * src0_qh,
|
||||
global void * src0_d,
|
||||
global float * src1,
|
||||
global float * dst,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
const ulong nb = ne00/QK5_0;
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int r1 = get_group_id(1);
|
||||
int im = get_group_id(2);
|
||||
|
||||
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
ulong offset0_qs = offset0 * (QK5_0/2);
|
||||
|
||||
global uchar * x = (global uchar *) src0_qs + offset0_qs;
|
||||
global uint * qh = (global uint *) src0_qh + offset0;
|
||||
global half * d = (global half *) src0_d + offset0;
|
||||
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float16 yl;
|
||||
float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
|
||||
|
||||
int ix = get_sub_group_local_id()/2;
|
||||
int il = 8*(get_sub_group_local_id()%2);
|
||||
|
||||
global float * yb = y + ix * QK5_0 + il;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
|
||||
float sumy = 0;
|
||||
|
||||
sumy += yb[0];
|
||||
sumy += yb[1];
|
||||
sumy += yb[2];
|
||||
sumy += yb[3];
|
||||
sumy += yb[4];
|
||||
sumy += yb[5];
|
||||
sumy += yb[6];
|
||||
sumy += yb[7];
|
||||
|
||||
sumy += yb[16];
|
||||
sumy += yb[17];
|
||||
sumy += yb[18];
|
||||
sumy += yb[19];
|
||||
sumy += yb[20];
|
||||
sumy += yb[21];
|
||||
sumy += yb[22];
|
||||
sumy += yb[23];
|
||||
|
||||
|
||||
yl.s0 = yb[0];
|
||||
yl.s1 = yb[1]/256.f;
|
||||
|
||||
yl.s2 = yb[2];
|
||||
yl.s3 = yb[3]/256.f;
|
||||
|
||||
yl.s4 = yb[4];
|
||||
yl.s5 = yb[5]/256.f;
|
||||
|
||||
yl.s6 = yb[6];
|
||||
yl.s7 = yb[7]/256.f;
|
||||
|
||||
yl.s8 = yb[16]/16.f;
|
||||
yl.s9 = yb[17]/4096.f;
|
||||
|
||||
yl.sa = yb[18]/16.f;
|
||||
yl.sb = yb[19]/4096.f;
|
||||
|
||||
yl.sc = yb[20]/16.f;
|
||||
yl.sd = yb[21]/4096.f;
|
||||
|
||||
yl.se = yb[22]/16.f;
|
||||
yl.sf = yb[23]/4096.f;
|
||||
|
||||
sumf.s0 += block_q5_0_dot_y_flat(x + ib*(QK5_0/2) + 0*nb*(QK5_0/2), qh + ib + 0*nb, d + ib + 0*nb, sumy, yl, il, yb);
|
||||
sumf.s1 += block_q5_0_dot_y_flat(x + ib*(QK5_0/2) + 1*nb*(QK5_0/2), qh + ib + 1*nb, d + ib + 1*nb, sumy, yl, il, yb);
|
||||
sumf.s2 += block_q5_0_dot_y_flat(x + ib*(QK5_0/2) + 2*nb*(QK5_0/2), qh + ib + 2*nb, d + ib + 2*nb, sumy, yl, il, yb);
|
||||
sumf.s3 += block_q5_0_dot_y_flat(x + ib*(QK5_0/2) + 3*nb*(QK5_0/2), qh + ib + 3*nb, d + ib + 3*nb, sumy, yl, il, yb);
|
||||
|
||||
yb += QK5_0 * (N_SIMDWIDTH/2);
|
||||
}
|
||||
|
||||
float4 tot = (float4)(
|
||||
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
|
||||
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
|
||||
);
|
||||
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
if (first_row + 0 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
|
||||
}
|
||||
if (first_row + 1 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
|
||||
}
|
||||
if (first_row + 2 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
|
||||
}
|
||||
if (first_row + 3 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_16
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mv_q5_0_f32_flat(
|
||||
global void * src0_qs,
|
||||
global void * src0_qh,
|
||||
global void * src0_d,
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
mul_vec_q_n_f32_flat(src0_qs, src0_qh, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
|
||||
}
|
||||
@@ -0,0 +1,243 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define QK5_1 32
|
||||
|
||||
struct block_q5_1 {
|
||||
half d;
|
||||
half m;
|
||||
uchar qh[4];
|
||||
uchar qs[QK5_1 / 2];
|
||||
};
|
||||
|
||||
inline float block_q5_1_dot_y(
|
||||
global const struct block_q5_1 * qb_curr,
|
||||
float sumy,
|
||||
float16 yl,
|
||||
int il,
|
||||
global const float * yb
|
||||
) {
|
||||
float d = qb_curr->d;
|
||||
float m = qb_curr->m;
|
||||
|
||||
float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
global const ushort * qs = ((global const ushort *)((global const uchar *) qb_curr + 8 + il));
|
||||
|
||||
acc.s0 += yl.s0 * (qs[0] & 0x000F);
|
||||
acc.s0 += yl.s1 * (qs[0] & 0x0F00);
|
||||
acc.s0 += yl.s8 * (qs[0] & 0x00F0);
|
||||
acc.s3 += yl.s9 * (qs[0] & 0xF000);
|
||||
|
||||
acc.s0 += yl.s2 * (qs[1] & 0x000F);
|
||||
acc.s1 += yl.s3 * (qs[1] & 0x0F00);
|
||||
acc.s2 += yl.sa * (qs[1] & 0x00F0);
|
||||
acc.s3 += yl.sb * (qs[1] & 0xF000);
|
||||
|
||||
acc.s0 += yl.s4 * (qs[2] & 0x000F);
|
||||
acc.s1 += yl.s5 * (qs[2] & 0x0F00);
|
||||
acc.s2 += yl.sc * (qs[2] & 0x00F0);
|
||||
acc.s3 += yl.sd * (qs[2] & 0xF000);
|
||||
|
||||
acc.s0 += yl.s6 * (qs[3] & 0x000F);
|
||||
acc.s1 += yl.s7 * (qs[3] & 0x0F00);
|
||||
acc.s2 += yl.se * (qs[3] & 0x00F0);
|
||||
acc.s3 += yl.sf * (qs[3] & 0xF000);
|
||||
|
||||
uint qh_val = *((global const uint *)((global const uchar *) qb_curr + 4));
|
||||
uchar qh_lo = (uchar)((qh_val >> il) & 0xFF);
|
||||
uchar qh_hi = (uchar)((qh_val >> (il + 16)) & 0xFF);
|
||||
|
||||
float qh_sum = 0.0f;
|
||||
qh_sum += yb[0] * (float)((qh_lo >> 0) & 1);
|
||||
qh_sum += yb[1] * (float)((qh_lo >> 1) & 1);
|
||||
qh_sum += yb[2] * (float)((qh_lo >> 2) & 1);
|
||||
qh_sum += yb[3] * (float)((qh_lo >> 3) & 1);
|
||||
qh_sum += yb[4] * (float)((qh_lo >> 4) & 1);
|
||||
qh_sum += yb[5] * (float)((qh_lo >> 5) & 1);
|
||||
qh_sum += yb[6] * (float)((qh_lo >> 6) & 1);
|
||||
qh_sum += yb[7] * (float)((qh_lo >> 7) & 1);
|
||||
qh_sum += yb[16] * (float)((qh_hi >> 0) & 1);
|
||||
qh_sum += yb[17] * (float)((qh_hi >> 1) & 1);
|
||||
qh_sum += yb[18] * (float)((qh_hi >> 2) & 1);
|
||||
qh_sum += yb[19] * (float)((qh_hi >> 3) & 1);
|
||||
qh_sum += yb[20] * (float)((qh_hi >> 4) & 1);
|
||||
qh_sum += yb[21] * (float)((qh_hi >> 5) & 1);
|
||||
qh_sum += yb[22] * (float)((qh_hi >> 6) & 1);
|
||||
qh_sum += yb[23] * (float)((qh_hi >> 7) & 1);
|
||||
|
||||
return d * (acc.s0 + acc.s1 + acc.s2 + acc.s3 + 16.0f * qh_sum) + sumy * m;
|
||||
}
|
||||
|
||||
#undef N_DST
|
||||
#undef N_SIMDGROUP
|
||||
#undef N_SIMDWIDTH
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
#define N_DST 4 // each subgroup works on 4 rows
|
||||
#define N_SIMDGROUP 1 // number of subgroups in a thread group
|
||||
#define N_SIMDWIDTH 16 // assuming subgroup size is 16
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_DST 4
|
||||
#define N_SIMDGROUP 1
|
||||
#define N_SIMDWIDTH 64
|
||||
#endif
|
||||
|
||||
inline void mul_vec_q_n_f32(
|
||||
global void * src0,
|
||||
global float * src1,
|
||||
global float * dst,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
const ulong nb = ne00/QK5_1;
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int r1 = get_group_id(1);
|
||||
int im = get_group_id(2);
|
||||
|
||||
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
global struct block_q5_1 * x = (global struct block_q5_1 *) src0 + offset0;
|
||||
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float16 yl;
|
||||
float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
|
||||
|
||||
int ix = get_sub_group_local_id()/2;
|
||||
int il = 8*(get_sub_group_local_id()%2);
|
||||
|
||||
global float * yb = y + ix * QK5_1 + il;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
|
||||
float sumy = 0;
|
||||
|
||||
sumy += yb[0];
|
||||
sumy += yb[1];
|
||||
sumy += yb[2];
|
||||
sumy += yb[3];
|
||||
sumy += yb[4];
|
||||
sumy += yb[5];
|
||||
sumy += yb[6];
|
||||
sumy += yb[7];
|
||||
|
||||
sumy += yb[16];
|
||||
sumy += yb[17];
|
||||
sumy += yb[18];
|
||||
sumy += yb[19];
|
||||
sumy += yb[20];
|
||||
sumy += yb[21];
|
||||
sumy += yb[22];
|
||||
sumy += yb[23];
|
||||
|
||||
|
||||
yl.s0 = yb[0];
|
||||
yl.s1 = yb[1]/256.f;
|
||||
|
||||
yl.s2 = yb[2];
|
||||
yl.s3 = yb[3]/256.f;
|
||||
|
||||
yl.s4 = yb[4];
|
||||
yl.s5 = yb[5]/256.f;
|
||||
|
||||
yl.s6 = yb[6];
|
||||
yl.s7 = yb[7]/256.f;
|
||||
|
||||
yl.s8 = yb[16]/16.f;
|
||||
yl.s9 = yb[17]/4096.f;
|
||||
|
||||
yl.sa = yb[18]/16.f;
|
||||
yl.sb = yb[19]/4096.f;
|
||||
|
||||
yl.sc = yb[20]/16.f;
|
||||
yl.sd = yb[21]/4096.f;
|
||||
|
||||
yl.se = yb[22]/16.f;
|
||||
yl.sf = yb[23]/4096.f;
|
||||
|
||||
sumf.s0 += block_q5_1_dot_y(x+ib+0*nb, sumy, yl, il, yb);
|
||||
sumf.s1 += block_q5_1_dot_y(x+ib+1*nb, sumy, yl, il, yb);
|
||||
sumf.s2 += block_q5_1_dot_y(x+ib+2*nb, sumy, yl, il, yb);
|
||||
sumf.s3 += block_q5_1_dot_y(x+ib+3*nb, sumy, yl, il, yb);
|
||||
|
||||
yb += QK5_1 * (N_SIMDWIDTH/2);
|
||||
}
|
||||
|
||||
float4 tot = (float4)(
|
||||
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
|
||||
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
|
||||
);
|
||||
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
if (first_row + 0 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
|
||||
}
|
||||
if (first_row + 1 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
|
||||
}
|
||||
if (first_row + 2 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
|
||||
}
|
||||
if (first_row + 3 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_16
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mv_q5_1_f32(
|
||||
global void * src0,
|
||||
ulong offset0,
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src0 = (global void*)((global char*)src0 + offset0);
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
|
||||
}
|
||||
@@ -0,0 +1,247 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define QK5_1 32
|
||||
|
||||
inline float block_q5_1_dot_y_flat(
|
||||
global const uchar * x,
|
||||
global const uint * qh_ptr,
|
||||
global const half * dh,
|
||||
global const half * mh,
|
||||
float sumy,
|
||||
float16 yl,
|
||||
int il,
|
||||
global const float * yb
|
||||
) {
|
||||
float d = *dh;
|
||||
float m = *mh;
|
||||
global const ushort * qs = ((global const ushort *)(x + il));
|
||||
|
||||
float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
acc.s0 += yl.s0 * (qs[0] & 0x000F);
|
||||
acc.s0 += yl.s1 * (qs[0] & 0x0F00);
|
||||
acc.s0 += yl.s8 * (qs[0] & 0x00F0);
|
||||
acc.s3 += yl.s9 * (qs[0] & 0xF000);
|
||||
|
||||
acc.s0 += yl.s2 * (qs[1] & 0x000F);
|
||||
acc.s1 += yl.s3 * (qs[1] & 0x0F00);
|
||||
acc.s2 += yl.sa * (qs[1] & 0x00F0);
|
||||
acc.s3 += yl.sb * (qs[1] & 0xF000);
|
||||
|
||||
acc.s0 += yl.s4 * (qs[2] & 0x000F);
|
||||
acc.s1 += yl.s5 * (qs[2] & 0x0F00);
|
||||
acc.s2 += yl.sc * (qs[2] & 0x00F0);
|
||||
acc.s3 += yl.sd * (qs[2] & 0xF000);
|
||||
|
||||
acc.s0 += yl.s6 * (qs[3] & 0x000F);
|
||||
acc.s1 += yl.s7 * (qs[3] & 0x0F00);
|
||||
acc.s2 += yl.se * (qs[3] & 0x00F0);
|
||||
acc.s3 += yl.sf * (qs[3] & 0xF000);
|
||||
|
||||
uint qh_val = *qh_ptr;
|
||||
uchar qh_lo = (uchar)((qh_val >> il) & 0xFF);
|
||||
uchar qh_hi = (uchar)((qh_val >> (il + 16)) & 0xFF);
|
||||
|
||||
float qh_sum = 0.0f;
|
||||
qh_sum += yb[0] * (float)((qh_lo >> 0) & 1);
|
||||
qh_sum += yb[1] * (float)((qh_lo >> 1) & 1);
|
||||
qh_sum += yb[2] * (float)((qh_lo >> 2) & 1);
|
||||
qh_sum += yb[3] * (float)((qh_lo >> 3) & 1);
|
||||
qh_sum += yb[4] * (float)((qh_lo >> 4) & 1);
|
||||
qh_sum += yb[5] * (float)((qh_lo >> 5) & 1);
|
||||
qh_sum += yb[6] * (float)((qh_lo >> 6) & 1);
|
||||
qh_sum += yb[7] * (float)((qh_lo >> 7) & 1);
|
||||
qh_sum += yb[16] * (float)((qh_hi >> 0) & 1);
|
||||
qh_sum += yb[17] * (float)((qh_hi >> 1) & 1);
|
||||
qh_sum += yb[18] * (float)((qh_hi >> 2) & 1);
|
||||
qh_sum += yb[19] * (float)((qh_hi >> 3) & 1);
|
||||
qh_sum += yb[20] * (float)((qh_hi >> 4) & 1);
|
||||
qh_sum += yb[21] * (float)((qh_hi >> 5) & 1);
|
||||
qh_sum += yb[22] * (float)((qh_hi >> 6) & 1);
|
||||
qh_sum += yb[23] * (float)((qh_hi >> 7) & 1);
|
||||
|
||||
return d * (acc.s0 + acc.s1 + acc.s2 + acc.s3 + 16.0f * qh_sum) + sumy * m;
|
||||
}
|
||||
|
||||
#undef N_DST
|
||||
#undef N_SIMDGROUP
|
||||
#undef N_SIMDWIDTH
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
#define N_DST 4 // each subgroup works on 4 rows
|
||||
#define N_SIMDGROUP 1 // number of subgroups in a thread group
|
||||
#define N_SIMDWIDTH 16 // assuming subgroup size is 16
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_DST 4
|
||||
#define N_SIMDGROUP 1
|
||||
#define N_SIMDWIDTH 64
|
||||
#endif
|
||||
|
||||
inline void mul_vec_q_n_f32_flat(
|
||||
global void * src0_qs,
|
||||
global void * src0_qh,
|
||||
global void * src0_d,
|
||||
global void * src0_m,
|
||||
global float * src1,
|
||||
global float * dst,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
const ulong nb = ne00/QK5_1;
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int r1 = get_group_id(1);
|
||||
int im = get_group_id(2);
|
||||
|
||||
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
ulong offset0_qs = offset0 * (QK5_1/2);
|
||||
|
||||
global uchar * x = (global uchar *) src0_qs + offset0_qs;
|
||||
global uint * qh = (global uint *) src0_qh + offset0;
|
||||
global half * d = (global half *) src0_d + offset0;
|
||||
global half * ms = (global half *) src0_m + offset0;
|
||||
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float16 yl;
|
||||
float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
|
||||
|
||||
int ix = get_sub_group_local_id()/2;
|
||||
int il = 8*(get_sub_group_local_id()%2);
|
||||
|
||||
global float * yb = y + ix * QK5_1 + il;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
|
||||
float sumy = 0;
|
||||
|
||||
sumy += yb[0];
|
||||
sumy += yb[1];
|
||||
sumy += yb[2];
|
||||
sumy += yb[3];
|
||||
sumy += yb[4];
|
||||
sumy += yb[5];
|
||||
sumy += yb[6];
|
||||
sumy += yb[7];
|
||||
|
||||
sumy += yb[16];
|
||||
sumy += yb[17];
|
||||
sumy += yb[18];
|
||||
sumy += yb[19];
|
||||
sumy += yb[20];
|
||||
sumy += yb[21];
|
||||
sumy += yb[22];
|
||||
sumy += yb[23];
|
||||
|
||||
|
||||
yl.s0 = yb[0];
|
||||
yl.s1 = yb[1]/256.f;
|
||||
|
||||
yl.s2 = yb[2];
|
||||
yl.s3 = yb[3]/256.f;
|
||||
|
||||
yl.s4 = yb[4];
|
||||
yl.s5 = yb[5]/256.f;
|
||||
|
||||
yl.s6 = yb[6];
|
||||
yl.s7 = yb[7]/256.f;
|
||||
|
||||
yl.s8 = yb[16]/16.f;
|
||||
yl.s9 = yb[17]/4096.f;
|
||||
|
||||
yl.sa = yb[18]/16.f;
|
||||
yl.sb = yb[19]/4096.f;
|
||||
|
||||
yl.sc = yb[20]/16.f;
|
||||
yl.sd = yb[21]/4096.f;
|
||||
|
||||
yl.se = yb[22]/16.f;
|
||||
yl.sf = yb[23]/4096.f;
|
||||
|
||||
sumf.s0 += block_q5_1_dot_y_flat(x + ib*(QK5_1/2) + 0*nb*(QK5_1/2), qh + ib + 0*nb, d + ib + 0*nb, ms + ib + 0*nb, sumy, yl, il, yb);
|
||||
sumf.s1 += block_q5_1_dot_y_flat(x + ib*(QK5_1/2) + 1*nb*(QK5_1/2), qh + ib + 1*nb, d + ib + 1*nb, ms + ib + 1*nb, sumy, yl, il, yb);
|
||||
sumf.s2 += block_q5_1_dot_y_flat(x + ib*(QK5_1/2) + 2*nb*(QK5_1/2), qh + ib + 2*nb, d + ib + 2*nb, ms + ib + 2*nb, sumy, yl, il, yb);
|
||||
sumf.s3 += block_q5_1_dot_y_flat(x + ib*(QK5_1/2) + 3*nb*(QK5_1/2), qh + ib + 3*nb, d + ib + 3*nb, ms + ib + 3*nb, sumy, yl, il, yb);
|
||||
|
||||
yb += QK5_1 * (N_SIMDWIDTH/2);
|
||||
}
|
||||
|
||||
float4 tot = (float4)(
|
||||
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
|
||||
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
|
||||
);
|
||||
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
if (first_row + 0 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
|
||||
}
|
||||
if (first_row + 1 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
|
||||
}
|
||||
if (first_row + 2 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
|
||||
}
|
||||
if (first_row + 3 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_16
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mv_q5_1_f32_flat(
|
||||
global void * src0_qs,
|
||||
global void * src0_qh,
|
||||
global void * src0_d,
|
||||
global void * src0_m,
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
mul_vec_q_n_f32_flat(src0_qs, src0_qh, src0_d, src0_m, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
|
||||
}
|
||||
@@ -62,8 +62,10 @@ typedef struct VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV {
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include <shared_mutex>
|
||||
#include <mutex>
|
||||
#include <future>
|
||||
#include <condition_variable>
|
||||
#include <thread>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
@@ -158,8 +160,9 @@ struct vk_pipeline_struct {
|
||||
uint32_t align;
|
||||
// true if fields have been set by ggml_vk_create_pipeline
|
||||
bool initialized {};
|
||||
// set to true to request the pipeline is compiled
|
||||
std::atomic<bool> needed {};
|
||||
// true while a compile is in flight, used to dedupe concurrent claims.
|
||||
// Protected by device->compile_mutex.
|
||||
bool compile_pending {};
|
||||
// set to true when the shader has been compiled
|
||||
std::atomic<bool> compiled {};
|
||||
// number of registers used, extracted from pipeline executable properties
|
||||
@@ -618,6 +621,14 @@ static constexpr std::initializer_list<std::array<int, 3>> rms_norm_mul_rope_vie
|
||||
|
||||
struct vk_device_struct {
|
||||
std::recursive_mutex mutex;
|
||||
mutable std::shared_mutex pinned_memory_mutex;
|
||||
|
||||
// Guards compile_pending, all_pipelines, and the dynamic pipeline maps
|
||||
// (flash_attn, fa_mask_opt, solve_tri, conv2d, etc). The actual compile
|
||||
// runs with no lock held, so different pipelines can compile in parallel.
|
||||
// Lock order is device->mutex -> compile_mutex, never the reverse.
|
||||
std::mutex compile_mutex;
|
||||
std::condition_variable compile_cv;
|
||||
|
||||
vk::PhysicalDevice physical_device;
|
||||
vk::PhysicalDeviceProperties properties;
|
||||
@@ -1727,7 +1738,7 @@ struct ggml_vk_garbage_collector {
|
||||
};
|
||||
|
||||
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx);
|
||||
static void ggml_vk_load_shaders(vk_device& device);
|
||||
static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested = nullptr);
|
||||
static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx);
|
||||
|
||||
static bool vk_memory_logger_enabled = false;
|
||||
@@ -2194,11 +2205,6 @@ static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
|
||||
ctx->device->device.resetFences({ ctx->fence });
|
||||
}
|
||||
|
||||
// variables to track number of compiles in progress
|
||||
static uint32_t compile_count = 0;
|
||||
static std::mutex compile_count_mutex;
|
||||
static std::condition_variable compile_count_cond;
|
||||
|
||||
static constexpr uint32_t kSpvOpCooperativeMatrixLoadTensorNV = 5367;
|
||||
static constexpr uint32_t kSpvCapabilityCooperativeMatrixDecodeVectorNV = 5447;
|
||||
static constexpr uint32_t kSpvTensorAddressingDecodeVectorFuncBit = 0x4;
|
||||
@@ -2493,7 +2499,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
throw e;
|
||||
}
|
||||
pipeline->compiled = true;
|
||||
|
||||
if (vk_instance.debug_utils_support) {
|
||||
vk::DebugUtilsObjectNameInfoEXT duoni;
|
||||
@@ -2542,14 +2547,13 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
}
|
||||
}
|
||||
|
||||
device->all_pipelines.push_back(pipeline);
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(compile_count_mutex);
|
||||
assert(compile_count > 0);
|
||||
compile_count--;
|
||||
std::lock_guard<std::mutex> guard(device->compile_mutex);
|
||||
device->all_pipelines.push_back(pipeline);
|
||||
pipeline->compiled = true;
|
||||
pipeline->compile_pending = false;
|
||||
}
|
||||
compile_count_cond.notify_all();
|
||||
device->compile_cv.notify_all();
|
||||
}
|
||||
|
||||
static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
|
||||
@@ -2565,8 +2569,7 @@ static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx,
|
||||
VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
|
||||
ctx->pipeline_descriptor_set_requirements += n;
|
||||
if (!pipeline->compiled) {
|
||||
pipeline->needed = true;
|
||||
ggml_vk_load_shaders(ctx->device);
|
||||
ggml_vk_load_shaders(ctx->device, pipeline);
|
||||
}
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx);
|
||||
}
|
||||
@@ -3565,10 +3568,26 @@ static bool ggml_vk_fa_scalar_uses_mmq(const vk_device& device, ggml_type k_type
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_vk_load_shaders(vk_device& device) {
|
||||
// load_shaders walks the pipeline list under compile_mutex and either claims
|
||||
// the requested pipeline for compilation or, if another thread is already
|
||||
// compiling it, drops the lock and waits on compile_cv. Compiles themselves
|
||||
// run unlocked.
|
||||
struct CompileTask {
|
||||
vk_pipeline pipeline;
|
||||
size_t spv_size;
|
||||
const void * spv_data;
|
||||
std::string entrypoint;
|
||||
uint32_t parameter_count;
|
||||
std::array<uint32_t, 3> wg_denoms;
|
||||
std::vector<uint32_t> specialization_constants;
|
||||
bool disable_robustness;
|
||||
bool require_full_subgroups;
|
||||
uint32_t required_subgroup_size;
|
||||
};
|
||||
|
||||
static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
|
||||
|
||||
std::lock_guard<std::recursive_mutex> guard(device->mutex);
|
||||
// some shaders have a minimum subgroup size
|
||||
const uint32_t subgroup_size_8 = std::max(device->subgroup_size, 8u);
|
||||
const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
|
||||
@@ -3598,6 +3617,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
l_mmqid_wg_denoms, m_mmqid_wg_denoms, s_mmqid_wg_denoms;
|
||||
|
||||
uint32_t l_align, m_align, s_align;
|
||||
|
||||
vk_pipeline wait_pipeline;
|
||||
CompileTask claimed_task {};
|
||||
bool has_claimed_task = false;
|
||||
|
||||
// The rest of the walk reads and writes shared device state, so hold the
|
||||
// lock until we're done deciding what to compile.
|
||||
std::unique_lock<std::mutex> compile_lock(device->compile_mutex);
|
||||
|
||||
if (device->coopmat2) {
|
||||
// spec constants and tile sizes for non-quant matmul/matmul_id
|
||||
l_warptile = { 256, 128, 256, 64, 1 };
|
||||
@@ -3783,7 +3811,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
device->pipeline_matmul_id_bf16 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
}
|
||||
|
||||
std::vector<std::future<void>> compiles;
|
||||
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& base_pipeline, const char *name, size_t spv_size, const void* spv_data, const char *entrypoint,
|
||||
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
|
||||
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
|
||||
@@ -3817,23 +3844,33 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
#endif
|
||||
}
|
||||
|
||||
if (!pipeline->needed || pipeline->compiled) {
|
||||
// We only care about the pipeline this call asked for; the rest
|
||||
// (including the 64-bit indexing variant) are handled by their
|
||||
// own request_descriptor_sets / load_shaders calls.
|
||||
if (pipeline.get() != requested.get()) {
|
||||
continue;
|
||||
}
|
||||
// TODO: We're no longer benefitting from the async compiles (shaders are
|
||||
// compiled individually, as needed) and this complexity can be removed.
|
||||
{
|
||||
// wait until fewer than N compiles are in progress
|
||||
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
||||
std::unique_lock<std::mutex> guard(compile_count_mutex);
|
||||
while (compile_count >= N) {
|
||||
compile_count_cond.wait(guard);
|
||||
}
|
||||
compile_count++;
|
||||
|
||||
if (pipeline->compiled) {
|
||||
continue;
|
||||
}
|
||||
|
||||
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint,
|
||||
parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
|
||||
wait_pipeline = pipeline;
|
||||
|
||||
if (!pipeline->compile_pending) {
|
||||
pipeline->compile_pending = true;
|
||||
claimed_task.pipeline = pipeline;
|
||||
claimed_task.spv_size = spv_size;
|
||||
claimed_task.spv_data = spv_data;
|
||||
claimed_task.entrypoint = entrypoint;
|
||||
claimed_task.parameter_count = parameter_count;
|
||||
claimed_task.wg_denoms = wg_denoms;
|
||||
claimed_task.specialization_constants = specialization_constants;
|
||||
claimed_task.disable_robustness = disable_robustness;
|
||||
claimed_task.require_full_subgroups = require_full_subgroups;
|
||||
claimed_task.required_subgroup_size = required_subgroup_size;
|
||||
has_claimed_task = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -5330,8 +5367,25 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &c : compiles) {
|
||||
c.wait();
|
||||
// Drop compile_mutex so other threads can walk while we compile.
|
||||
compile_lock.unlock();
|
||||
|
||||
// Compile what we claimed; create_pipeline_func reacquires compile_mutex
|
||||
// at the end to flip compile_pending/compiled and notify waiters.
|
||||
if (has_claimed_task) {
|
||||
auto & task = claimed_task;
|
||||
ggml_vk_create_pipeline_func(device, task.pipeline, task.spv_size, task.spv_data,
|
||||
task.entrypoint, task.parameter_count, task.wg_denoms,
|
||||
task.specialization_constants, task.disable_robustness,
|
||||
task.require_full_subgroups, task.required_subgroup_size);
|
||||
}
|
||||
|
||||
// Another thread may be compiling the pipeline we need; block on it here.
|
||||
if (wait_pipeline) {
|
||||
std::unique_lock<std::mutex> wait_lock(device->compile_mutex);
|
||||
device->compile_cv.wait(wait_lock, [&] {
|
||||
return wait_pipeline->compiled.load();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7010,7 +7064,7 @@ static void * ggml_vk_host_malloc(vk_device& device, size_t size) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::lock_guard<std::recursive_mutex> guard(device->mutex);
|
||||
std::lock_guard<std::shared_mutex> guard(device->pinned_memory_mutex);
|
||||
device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf));
|
||||
|
||||
return buf->ptr;
|
||||
@@ -7021,7 +7075,7 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) {
|
||||
return;
|
||||
}
|
||||
VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
|
||||
std::lock_guard<std::recursive_mutex> guard(device->mutex);
|
||||
std::lock_guard<std::shared_mutex> guard(device->pinned_memory_mutex);
|
||||
|
||||
vk_buffer buf;
|
||||
size_t index;
|
||||
@@ -7045,7 +7099,7 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) {
|
||||
}
|
||||
|
||||
static void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) {
|
||||
std::lock_guard<std::recursive_mutex> guard(device->mutex);
|
||||
std::shared_lock<std::shared_mutex> guard(device->pinned_memory_mutex);
|
||||
buf = nullptr;
|
||||
buf_offset = 0;
|
||||
for (size_t i = 0; i < device->pinned_memory.size(); i++) {
|
||||
@@ -9720,7 +9774,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
vk_pipeline pipeline = nullptr;
|
||||
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> guard(ctx->device->mutex);
|
||||
std::lock_guard<std::mutex> guard(ctx->device->compile_mutex);
|
||||
auto &pipelines = ctx->device->pipeline_flash_attn_f32_f16;
|
||||
auto it = pipelines.find(fa_pipeline_state);
|
||||
if (it != pipelines.end()) {
|
||||
@@ -9784,13 +9838,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
|
||||
vk_pipeline pipeline_fa_mask_opt = nullptr;
|
||||
if (use_mask_opt) {
|
||||
std::lock_guard<std::recursive_mutex> guard(ctx->device->mutex);
|
||||
auto &pipelines = ctx->device->pipeline_fa_mask_opt;
|
||||
auto it = pipelines.find({Br, Bc});
|
||||
if (it != pipelines.end()) {
|
||||
pipeline_fa_mask_opt = it->second;
|
||||
} else {
|
||||
pipelines[{Br, Bc}] = pipeline_fa_mask_opt = std::make_shared<vk_pipeline_struct>();
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(ctx->device->compile_mutex);
|
||||
auto &pipelines = ctx->device->pipeline_fa_mask_opt;
|
||||
auto it = pipelines.find({Br, Bc});
|
||||
if (it != pipelines.end()) {
|
||||
pipeline_fa_mask_opt = it->second;
|
||||
} else {
|
||||
pipelines[{Br, Bc}] = pipeline_fa_mask_opt = std::make_shared<vk_pipeline_struct>();
|
||||
}
|
||||
}
|
||||
assert(pipeline_fa_mask_opt);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline_fa_mask_opt, 1);
|
||||
@@ -10324,7 +10380,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
vk_pipeline pipeline = nullptr;
|
||||
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> guard(ctx->device->mutex);
|
||||
std::lock_guard<std::mutex> guard(ctx->device->compile_mutex);
|
||||
auto it = ctx->device->pipeline_solve_tri_f32.find(solve_tri_pipeline_state);
|
||||
if (it != ctx->device->pipeline_solve_tri_f32.end()) {
|
||||
pipeline = it->second;
|
||||
@@ -10483,7 +10539,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
vk_pipeline pipeline = nullptr;
|
||||
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> guard(ctx->device->mutex);
|
||||
std::lock_guard<std::mutex> guard(ctx->device->compile_mutex);
|
||||
auto it = pipelines->find(conv2d_pipeline_state);
|
||||
if (it != pipelines->end()) {
|
||||
pipeline = it->second;
|
||||
|
||||
@@ -50,13 +50,13 @@ var<uniform> params: Params;
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(
|
||||
@builtin(global_invocation_index) gindex: u32,
|
||||
@builtin(global_invocation_id) gid: vec3<u32>,
|
||||
) {
|
||||
if (gindex >= params.ne) {
|
||||
if (gid.x >= params.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
var i = gindex;
|
||||
var i = gid.x;
|
||||
let i3 = i / (params.src_ne2 * params.src_ne1 * params.src_ne0);
|
||||
i = i % (params.src_ne2 * params.src_ne1 * params.src_ne0);
|
||||
let i2 = i / (params.src_ne1 * params.src_ne0);
|
||||
@@ -64,7 +64,7 @@ fn main(
|
||||
let i1 = i / params.src_ne0;
|
||||
let i0 = i % params.src_ne0;
|
||||
|
||||
var j = gindex;
|
||||
var j = gid.x;
|
||||
let j3 = j / (params.dst_ne2 * params.dst_ne1 * params.dst_ne0);
|
||||
j = j % (params.dst_ne2 * params.dst_ne1 * params.dst_ne0);
|
||||
let j2 = j / (params.dst_ne1 * params.dst_ne0);
|
||||
@@ -80,4 +80,3 @@ fn main(
|
||||
|
||||
dst[params.offset_dst + dst_idx] = DST_TYPE((src[params.offset_src + src_idx]));
|
||||
}
|
||||
|
||||
|
||||
@@ -150,6 +150,7 @@ class Keys:
|
||||
EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
|
||||
SWIGLU_CLAMP_EXP = "{arch}.swiglu_clamp_exp"
|
||||
SWIGLU_CLAMP_SHEXP = "{arch}.swiglu_clamp_shexp"
|
||||
HIDDEN_ACT = "{arch}.hidden_activation"
|
||||
DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
|
||||
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
|
||||
|
||||
@@ -509,6 +510,7 @@ class MODEL_ARCH(IntEnum):
|
||||
MAINCODER = auto()
|
||||
KIMI_LINEAR = auto()
|
||||
TALKIE = auto()
|
||||
MELLUM = auto()
|
||||
|
||||
|
||||
class VISION_PROJECTOR_TYPE(IntEnum):
|
||||
@@ -1029,6 +1031,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.MAINCODER: "maincoder",
|
||||
MODEL_ARCH.KIMI_LINEAR: "kimi-linear",
|
||||
MODEL_ARCH.TALKIE: "talkie",
|
||||
MODEL_ARCH.MELLUM: "mellum",
|
||||
}
|
||||
|
||||
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
||||
@@ -3310,6 +3313,13 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_POST_NORM,
|
||||
# NextN/MTP tensors - preserved but unused
|
||||
MODEL_TENSOR.NEXTN_EH_PROJ,
|
||||
MODEL_TENSOR.NEXTN_EMBED_TOKENS,
|
||||
MODEL_TENSOR.NEXTN_ENORM,
|
||||
MODEL_TENSOR.NEXTN_HNORM,
|
||||
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
|
||||
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
|
||||
],
|
||||
MODEL_ARCH.EXAONE_MOE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
@@ -3987,6 +3997,13 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP,
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_EXP_PROBS_B,
|
||||
# NextN/MTP tensors (Step3p5 draft head)
|
||||
MODEL_TENSOR.NEXTN_EH_PROJ,
|
||||
MODEL_TENSOR.NEXTN_EMBED_TOKENS,
|
||||
MODEL_TENSOR.NEXTN_ENORM,
|
||||
MODEL_TENSOR.NEXTN_HNORM,
|
||||
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
|
||||
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
|
||||
],
|
||||
MODEL_ARCH.LLAMA_EMBED: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
@@ -4078,6 +4095,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.LAYER_OUT_SCALE,
|
||||
],
|
||||
MODEL_ARCH.MELLUM: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
@@ -4318,6 +4352,7 @@ class VisionProjectorType:
|
||||
LLAMA4 = "llama4"
|
||||
QWEN2VL = "qwen2vl_merger"
|
||||
QWEN25VL = "qwen2.5vl_merger"
|
||||
EXAONE4_5 = "exaone4_5"
|
||||
QWEN3VL = "qwen3vl_merger"
|
||||
STEP3VL = "step3vl"
|
||||
ULTRAVOX = "ultravox"
|
||||
|
||||
@@ -853,6 +853,9 @@ class GGUFWriter:
|
||||
def add_swiglu_clamp_shexp(self, values: Sequence[float]) -> None:
|
||||
self.add_array(Keys.LLM.SWIGLU_CLAMP_SHEXP.format(arch=self.arch), values)
|
||||
|
||||
def add_hidden_act(self, value: str) -> None:
|
||||
self.add_string(Keys.LLM.HIDDEN_ACT.format(arch=self.arch), value)
|
||||
|
||||
def add_expert_group_scale(self, value: float) -> None:
|
||||
self.add_float32(Keys.LLM.EXPERT_GROUP_SCALE.format(arch=self.arch), value)
|
||||
|
||||
|
||||
+6
-1
@@ -339,6 +339,7 @@ extern "C" {
|
||||
uint32_t n_ubatch; // physical maximum batch size
|
||||
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
|
||||
uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
|
||||
uint32_t n_outputs_max; // max outputs in a ubatch (0 = n_batch)
|
||||
int32_t n_threads; // number of threads to use for generation
|
||||
int32_t n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
@@ -975,7 +976,11 @@ extern "C" {
|
||||
|
||||
// Set whether the model is in warmup mode or not
|
||||
// If true, all model tensors are activated during llama_decode() to load and cache their weights.
|
||||
LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
|
||||
//
|
||||
// note: using this can cause extra graph reallocations because it changes the graph topology with MoE models,
|
||||
// so it is generally not recommended to use in practice. will be removed in the future
|
||||
DEPRECATED(LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup),
|
||||
"user code should do warmup runs manually [TAG_LLAMA_GRAPH_NO_WARMUP]");
|
||||
|
||||
// Set abort callback
|
||||
LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
|
||||
+1
-1
@@ -10,7 +10,7 @@ requires-python = '>=3.10,<3.15'
|
||||
dependencies = [
|
||||
'numpy (>=1.26.4,<3.0.0)',
|
||||
'sentencepiece (>=0.1.98,<0.3.0)',
|
||||
'transformers (==5.5.1)',
|
||||
'transformers (==4.57.6)',
|
||||
'protobuf (>=4.21.0,<5.0.0)',
|
||||
'torch (>=2.6.0,<3.0.0)',
|
||||
'gguf @ ./gguf-py',
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
numpy~=1.26.4
|
||||
sentencepiece>=0.1.98,<0.3.0
|
||||
|
||||
transformers==5.5.1
|
||||
transformers==4.57.6
|
||||
|
||||
gguf>=0.1.0
|
||||
protobuf>=4.21.0,<5.0.0
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
aiohttp~=3.9.3
|
||||
pytest~=8.3.3
|
||||
huggingface_hub>=1.5.0,<2.0
|
||||
matplotlib~=3.10.0
|
||||
numpy~=1.26.4
|
||||
openai~=2.14.0
|
||||
|
||||
@@ -11,6 +11,7 @@ from collections import defaultdict
|
||||
|
||||
# Mapping of cli-friendly names to (internal_data_key, Display Header, numeric_sort_key)
|
||||
COL_MAP = {
|
||||
"tot-usec": ("tot_usec", "Tot usec", "_sort_tot_usec"),
|
||||
"op": ("op", "Op", "op"),
|
||||
"dims": ("dims", "Dims", "dims"),
|
||||
"dtypes": ("dtypes", "DTypes", "dtypes"),
|
||||
@@ -24,7 +25,7 @@ COL_MAP = {
|
||||
}
|
||||
|
||||
op_pattern = re.compile(
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
|
||||
)
|
||||
|
||||
logger = logging.getLogger("ggml-hexagon-profile")
|
||||
@@ -85,21 +86,27 @@ def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None):
|
||||
cycles = [o['cycles'] for o in group_ops]
|
||||
pmu_vals = [o['pmu_val'] for o in group_ops if o['pmu_val'] is not None]
|
||||
|
||||
avg_usec_val = statistics.mean(usecs)
|
||||
count_val = len(group_ops)
|
||||
tot_usec_val = avg_usec_val * count_val
|
||||
|
||||
group_stats.append({
|
||||
'op': name,
|
||||
'dims': dims,
|
||||
'dtypes': types,
|
||||
'count': str(len(group_ops)),
|
||||
'count': str(count_val),
|
||||
'max_usec': str(max(usecs)),
|
||||
'avg_usec': f"{statistics.mean(usecs):.2f}",
|
||||
'avg_usec': f"{avg_usec_val:.2f}",
|
||||
'tot_usec': f"{tot_usec_val:.2f}",
|
||||
'max_cycles': str(max(cycles)),
|
||||
'avg_cycles': f"{statistics.mean(cycles):.2f}",
|
||||
'max_pmu': str(max(pmu_vals)) if pmu_vals else "0",
|
||||
'avg_pmu': f"{statistics.mean(pmu_vals):.2f}" if pmu_vals else "0.00",
|
||||
# Numeric values for accurate sorting
|
||||
'_sort_count': len(group_ops),
|
||||
'_sort_count': count_val,
|
||||
'_sort_max_usec': max(usecs),
|
||||
'_sort_avg_usec': statistics.mean(usecs),
|
||||
'_sort_avg_usec': avg_usec_val,
|
||||
'_sort_tot_usec': tot_usec_val,
|
||||
'_sort_max_cycles': max(cycles),
|
||||
'_sort_avg_cycles': statistics.mean(cycles),
|
||||
'_sort_max_pmu': max(pmu_vals) if pmu_vals else 0,
|
||||
@@ -116,7 +123,7 @@ def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None):
|
||||
active_cols = ["op", "dims", "dtypes"]
|
||||
if pmu_name:
|
||||
active_cols += ["max-pmu", "avg-pmu"]
|
||||
active_cols += ["max-usec", "avg-usec", "max-cycles", "avg-cycles", "count"]
|
||||
active_cols += ["tot-usec", "avg-usec", "avg-cycles", "max-usec", "max-cycles", "count"]
|
||||
|
||||
final_headers, final_keys, final_widths = [], [], []
|
||||
|
||||
@@ -156,7 +163,7 @@ def main():
|
||||
parser = argparse.ArgumentParser(description="Post-process Op profile info.")
|
||||
parser.add_argument("logfile")
|
||||
parser.add_argument("-n", "--top", type=int, default=100)
|
||||
parser.add_argument("--sort", type=str, default="max-usec", choices=list(COL_MAP.keys()))
|
||||
parser.add_argument("--sort", type=str, default="tot-usec", choices=list(COL_MAP.keys()))
|
||||
parser.add_argument("--pmu-index", type=int)
|
||||
parser.add_argument("--pmu-name", type=str)
|
||||
parser.add_argument("--width", action='append', default=['dims:40'], help="Override column width, e.g. --width dims:50")
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
import sys
|
||||
import subprocess
|
||||
|
||||
HTTPLIB_VERSION = "refs/tags/v0.46.0"
|
||||
HTTPLIB_VERSION = "refs/tags/v0.46.1"
|
||||
|
||||
vendor = {
|
||||
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",
|
||||
|
||||
@@ -135,6 +135,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_MAINCODER, "maincoder" },
|
||||
{ LLM_ARCH_KIMI_LINEAR, "kimi-linear" },
|
||||
{ LLM_ARCH_TALKIE, "talkie" },
|
||||
{ LLM_ARCH_MELLUM, "mellum" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@@ -195,6 +196,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" },
|
||||
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
|
||||
{ LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
|
||||
{ LLM_KV_HIDDEN_ACT, "%s.hidden_activation" },
|
||||
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
||||
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
||||
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
||||
@@ -245,6 +247,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
|
||||
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
|
||||
{ LLM_KV_ATTENTION_SHARED_KV_LAYERS, "%s.attention.shared_kv_layers" },
|
||||
{ LLM_KV_ATTENTION_RECURRENT_LAYERS, "%s.attention.recurrent_layers" },
|
||||
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
|
||||
|
||||
@@ -139,6 +139,7 @@ enum llm_arch {
|
||||
LLM_ARCH_MAINCODER,
|
||||
LLM_ARCH_KIMI_LINEAR,
|
||||
LLM_ARCH_TALKIE,
|
||||
LLM_ARCH_MELLUM,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -199,6 +200,7 @@ enum llm_kv {
|
||||
LLM_KV_MOE_LATENT_SIZE,
|
||||
LLM_KV_NEXTN_PREDICT_LAYERS,
|
||||
LLM_KV_NUM_DEEPSTACK_LAYERS,
|
||||
LLM_KV_HIDDEN_ACT,
|
||||
LLM_KV_POOLING_TYPE,
|
||||
LLM_KV_LOGIT_SCALE,
|
||||
LLM_KV_DECODER_START_TOKEN_ID,
|
||||
@@ -249,6 +251,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
|
||||
LLM_KV_ATTENTION_INDEXER_TOP_K,
|
||||
LLM_KV_ATTENTION_SHARED_KV_LAYERS,
|
||||
LLM_KV_ATTENTION_RECURRENT_LAYERS,
|
||||
|
||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||
LLM_KV_ROPE_DIMENSION_COUNT_SWA,
|
||||
|
||||
+15
-11
@@ -182,6 +182,8 @@ llama_context::llama_context(
|
||||
|
||||
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
||||
|
||||
cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max;
|
||||
|
||||
cparams.op_offload = params.op_offload;
|
||||
cparams.kv_unified = params.kv_unified;
|
||||
|
||||
@@ -227,6 +229,7 @@ llama_context::llama_context(
|
||||
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
||||
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
||||
LLAMA_LOG_INFO("%s: n_rs_seq = %u\n", __func__, cparams.n_rs_seq);
|
||||
LLAMA_LOG_INFO("%s: n_outputs_max = %u\n", __func__, cparams.n_outputs_max);
|
||||
|
||||
if (cparams.n_ctx_seq < hparams.n_ctx_train) {
|
||||
LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
|
||||
@@ -531,7 +534,7 @@ void llama_context::sched_reserve() {
|
||||
// note: n_outputs must match n_tokens for embedding models with mean/rank pooling,
|
||||
// because build_pooling creates inp_mean with shape [n_tokens, n_seqs] and multiplies
|
||||
// it with t_embd which is reduced to [n_outputs, ...] via out_ids. if n_outputs != n_tokens,
|
||||
// the ggml_mul_mat assertion fails. this matches the pp reservation below (line ~553).
|
||||
// the ggml_mul_mat assertion fails.
|
||||
const uint32_t n_tokens_ch = 16*n_seqs;
|
||||
auto * gf = graph_reserve(n_tokens_ch, n_seqs, n_tokens_ch, mctx.get(), true);
|
||||
if (!gf) {
|
||||
@@ -577,16 +580,18 @@ void llama_context::sched_reserve() {
|
||||
int n_splits_tg = -1;
|
||||
int n_nodes_tg = -1;
|
||||
|
||||
const uint32_t n_outputs_pp = std::min(n_tokens, cparams.n_outputs_max);
|
||||
|
||||
// reserve pp (prompt processing) graph first so that buffers are only allocated once
|
||||
{
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_outputs_pp, mctx.get(),
|
||||
model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
|
||||
if (!gf) {
|
||||
if (cparams.pipeline_parallel) {
|
||||
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
|
||||
cparams.pipeline_parallel = false;
|
||||
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
|
||||
gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
gf = graph_reserve(n_tokens, n_seqs, n_outputs_pp, mctx.get());
|
||||
}
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
@@ -614,7 +619,7 @@ void llama_context::sched_reserve() {
|
||||
//
|
||||
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
|
||||
//
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_outputs_pp, mctx.get(), model.hparams.no_alloc);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
}
|
||||
@@ -774,7 +779,9 @@ bool llama_context::memory_update(bool optimize) {
|
||||
const uint32_t n_seqs = cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
const uint32_t n_outputs_max = std::min(n_tokens, cparams.n_outputs_max);
|
||||
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_outputs_max, mctx.get());
|
||||
if (!gf) {
|
||||
LLAMA_LOG_ERROR("%s: failed to reserve graph after the memory update\n", __func__);
|
||||
}
|
||||
@@ -2140,6 +2147,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
|
||||
this->n_outputs = 0;
|
||||
|
||||
GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max);
|
||||
|
||||
return n_outputs_max;
|
||||
}
|
||||
|
||||
@@ -2226,8 +2235,6 @@ ggml_cgraph * llama_context::graph_reserve(
|
||||
|
||||
if (n_tokens % n_seqs != 0) {
|
||||
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
|
||||
n_outputs = std::max(n_outputs, n_tokens);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
}
|
||||
|
||||
@@ -3337,6 +3344,7 @@ llama_context_params llama_context_default_params() {
|
||||
/*.n_ubatch =*/ 512,
|
||||
/*.n_seq_max =*/ 1,
|
||||
/*.n_rs_seq =*/ 0,
|
||||
/*.n_outputs_max =*/ 0,
|
||||
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
|
||||
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
|
||||
/*.ctx_type =*/ LLAMA_CONTEXT_TYPE_DEFAULT,
|
||||
@@ -3403,10 +3411,6 @@ llama_context * llama_init_from_model(
|
||||
LLAMA_LOG_ERROR("%s: SPLIT_MODE_TENSOR requires flash_attn to be enabled\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
if (ggml_is_quantized(params.type_k) || ggml_is_quantized(params.type_v)) {
|
||||
LLAMA_LOG_ERROR("%s: simultaneous use of SPLIT_MODE_TENSOR and KV cache quantization not implemented\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) {
|
||||
|
||||
+2
-1
@@ -13,6 +13,7 @@ struct llama_cparams {
|
||||
uint32_t n_ubatch;
|
||||
uint32_t n_seq_max;
|
||||
uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback
|
||||
uint32_t n_outputs_max; // max outputs supported by the context
|
||||
int32_t n_threads; // number of threads to use for generation
|
||||
int32_t n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
@@ -38,7 +39,7 @@ struct llama_cparams {
|
||||
bool fused_gdn_ch; // use fused gated delta net (chunked)
|
||||
bool auto_fgdn;
|
||||
bool no_perf;
|
||||
bool warmup;
|
||||
bool warmup; // TODO: remove [TAG_LLAMA_GRAPH_NO_WARMUP]
|
||||
bool op_offload;
|
||||
bool kv_unified;
|
||||
bool pipeline_parallel;
|
||||
|
||||
+2
-1
@@ -36,7 +36,8 @@ enum llm_graph_type {
|
||||
LLM_GRAPH_TYPE_DECODER_MTP,
|
||||
};
|
||||
|
||||
enum llm_ffn_op_type {
|
||||
enum llm_ffn_op_type : int {
|
||||
LLM_FFN_NONE = 0, // sentinel: unset; archs must assign before use
|
||||
LLM_FFN_SILU,
|
||||
LLM_FFN_GELU,
|
||||
LLM_FFN_RELU,
|
||||
|
||||
+19
-6
@@ -8,18 +8,31 @@
|
||||
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
|
||||
if (dense_first) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
swa_layers[il] = n_pattern == 0 || (il % n_pattern != 0);
|
||||
is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
|
||||
}
|
||||
} else {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
|
||||
is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: implement
|
||||
//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
|
||||
// if (dense_first) {
|
||||
// for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
|
||||
// }
|
||||
// } else {
|
||||
// for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
|
||||
bool llama_hparams::is_swa_any() const {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
if (swa_layers[il]) {
|
||||
if (is_swa_impl[il]) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -193,9 +206,9 @@ uint32_t llama_hparams::n_embd_s() const {
|
||||
return ssm_d_state * ssm_d_inner;
|
||||
}
|
||||
|
||||
bool llama_hparams::is_recurrent(uint32_t il) const {
|
||||
bool llama_hparams::is_recr(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
return recurrent_layer_arr[il];
|
||||
return is_recr_impl[il];
|
||||
}
|
||||
|
||||
GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
|
||||
@@ -207,7 +220,7 @@ uint32_t llama_hparams::n_pos_per_embd() const {
|
||||
|
||||
bool llama_hparams::is_swa(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
return swa_layers[il];
|
||||
return is_swa_impl[il];
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
|
||||
+30
-12
@@ -23,6 +23,9 @@ enum llama_swa_type {
|
||||
LLAMA_SWA_TYPE_SYMMETRIC = 3,
|
||||
};
|
||||
|
||||
// forward declaration; full definition in llama-graph.h
|
||||
enum llm_ffn_op_type : int;
|
||||
|
||||
struct llama_hparams_posnet {
|
||||
uint32_t n_embd;
|
||||
uint32_t n_layer;
|
||||
@@ -34,6 +37,9 @@ struct llama_hparams_convnext {
|
||||
};
|
||||
|
||||
struct llama_hparams {
|
||||
// note: use the `_impl` suffix to avoid name conflict between members and getters
|
||||
// for example: n_embd_out() vs n_embd_out_impl
|
||||
|
||||
bool vocab_only;
|
||||
bool no_alloc;
|
||||
bool rope_finetuned;
|
||||
@@ -43,7 +49,7 @@ struct llama_hparams {
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
uint32_t n_embd;
|
||||
uint32_t n_layer;
|
||||
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
|
||||
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
|
||||
uint32_t n_expert = 0;
|
||||
uint32_t n_expert_used = 0;
|
||||
uint32_t n_rel_attn_bkts = 0;
|
||||
@@ -134,11 +140,15 @@ struct llama_hparams {
|
||||
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
// the size of the sliding window (0 - no SWA)
|
||||
uint32_t n_swa = 0;
|
||||
// if swa_layers[il] == 1, then layer il is SWA
|
||||
// if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
|
||||
|
||||
// if is_swa_impl[il] == 1, then layer il is SWA
|
||||
// if is_swa_impl[il] == 0, then layer il is dense (i.e. non-SWA)
|
||||
// by default, all layers are dense
|
||||
// note: using uint32_t type for compatibility reason
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> is_swa_impl;
|
||||
|
||||
// for hybrid state space models
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> is_recr_impl;
|
||||
|
||||
// for State Space Models
|
||||
uint32_t ssm_d_conv = 0;
|
||||
@@ -150,9 +160,6 @@ struct llama_hparams {
|
||||
// for Kimi Linear KDA
|
||||
uint32_t n_embd_head_kda = 0;
|
||||
|
||||
// for hybrid state space models
|
||||
std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
|
||||
|
||||
bool ssm_dt_b_c_rms = false;
|
||||
|
||||
float f_clamp_kqv = 0.0f;
|
||||
@@ -227,6 +234,14 @@ struct llama_hparams {
|
||||
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
|
||||
|
||||
|
||||
// Resolved FFN gated activation flavor for archs that read
|
||||
// `<arch>.hidden_activation` from the GGUF (e.g. ModernBert derivatives).
|
||||
// Defaults to LLM_FFN_NONE (sentinel = 0); the mapping from the GGUF
|
||||
// string to a real op is done at hparam-load time via
|
||||
// llm_ffn_op_type_from_string() in llama-model.cpp, mirroring how
|
||||
// rope_scaling_type_train is handled.
|
||||
enum llm_ffn_op_type llm_ffn_op;
|
||||
|
||||
// Step35: optional per-layer clamps for (Swi)GLU
|
||||
std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_exp; // clamping for expert FFN
|
||||
std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_shexp; // shared expert
|
||||
@@ -255,6 +270,14 @@ struct llama_hparams {
|
||||
// return true if one of the layers is SWA
|
||||
bool is_swa_any() const;
|
||||
|
||||
bool is_swa(uint32_t il) const;
|
||||
|
||||
// TODO: implement
|
||||
//void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);
|
||||
|
||||
// whether or not the given layer is recurrent (for hybrid models)
|
||||
bool is_recr(uint32_t il) const;
|
||||
|
||||
uint32_t n_head(uint32_t il = 0) const;
|
||||
|
||||
uint32_t n_head_kv(uint32_t il = 0) const;
|
||||
@@ -296,13 +319,8 @@ struct llama_hparams {
|
||||
// dimension of the recurrent state embeddings
|
||||
uint32_t n_embd_s() const;
|
||||
|
||||
// whether or not the given layer is recurrent (for hybrid models)
|
||||
bool is_recurrent(uint32_t il) const;
|
||||
|
||||
uint32_t n_pos_per_embd() const;
|
||||
|
||||
bool is_swa(uint32_t il) const;
|
||||
|
||||
// note: currently only support if either all or none of the layers are MLA
|
||||
bool is_mla() const;
|
||||
|
||||
|
||||
+14
-2
@@ -1876,7 +1876,19 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla
|
||||
uint32_t cell_range_begin = cells.size();
|
||||
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
|
||||
bool add_cell = true;
|
||||
|
||||
add_cell = add_cell && !cells.is_empty(i);
|
||||
add_cell = add_cell && (seq_id == -1 || cells.seq_has(i, seq_id));
|
||||
|
||||
// check the cell is not SWA-masked
|
||||
if (add_cell && seq_id != -1) {
|
||||
const bool is_masked = llama_hparams::is_masked_swa(n_swa, swa_type, cells.pos_get(i), cells.seq_pos_max(seq_id));
|
||||
|
||||
add_cell = !is_masked;
|
||||
}
|
||||
|
||||
if (add_cell) {
|
||||
++cell_count;
|
||||
if (cell_range_begin == cells.size()) {
|
||||
cell_range_begin = i;
|
||||
@@ -2129,7 +2141,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
|
||||
|
||||
sinfo = find_slot(ubatch, false);
|
||||
if (sinfo.empty()) {
|
||||
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
|
||||
LLAMA_LOG_ERROR("%s: failed to find %d available cells in kv cache\n", __func__, cell_count);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
|
||||
n_ubatch,
|
||||
n_pad,
|
||||
filter_attn == nullptr ?
|
||||
[&](int32_t il) { return !hparams.is_recurrent(il); }
|
||||
[&](int32_t il) { return !hparams.is_recr(il); }
|
||||
: filter_attn,
|
||||
nullptr
|
||||
)),
|
||||
@@ -57,7 +57,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
|
||||
n_seq_max,
|
||||
n_rs_seq,
|
||||
filter_recr == nullptr ?
|
||||
[&](int32_t il) { return hparams.is_recurrent(il); }
|
||||
[&](int32_t il) { return hparams.is_recr(il); }
|
||||
: filter_recr
|
||||
)) {}
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ llama_memory_hybrid::llama_memory_hybrid(
|
||||
n_swa,
|
||||
swa_type,
|
||||
filter_attn == nullptr ?
|
||||
[&](int32_t il) { return !hparams.is_recurrent(il); }
|
||||
[&](int32_t il) { return !hparams.is_recr(il); }
|
||||
: filter_attn,
|
||||
nullptr
|
||||
)),
|
||||
@@ -58,7 +58,7 @@ llama_memory_hybrid::llama_memory_hybrid(
|
||||
n_seq_max,
|
||||
n_rs_seq,
|
||||
filter_recr == nullptr ?
|
||||
[&](int32_t il) { return hparams.is_recurrent(il); }
|
||||
[&](int32_t il) { return hparams.is_recr(il); }
|
||||
: filter_recr
|
||||
)) {}
|
||||
|
||||
|
||||
@@ -146,7 +146,7 @@ namespace GGUFMeta {
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
|
||||
return ArrayInfo {
|
||||
arr_type,
|
||||
size_t(gguf_get_arr_n(ctx, k)),
|
||||
gguf_get_arr_n(ctx, k),
|
||||
arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
|
||||
};
|
||||
}
|
||||
@@ -445,7 +445,7 @@ namespace GGUFMeta {
|
||||
}
|
||||
|
||||
if (n > N_MAX) {
|
||||
throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
|
||||
throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", n, (uint32_t) N_MAX, key.c_str()));
|
||||
}
|
||||
|
||||
if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {
|
||||
@@ -502,9 +502,9 @@ namespace GGUFMeta {
|
||||
}
|
||||
|
||||
// TODO: this is not very clever - figure out something better
|
||||
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
|
||||
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>> (enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
|
||||
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
|
||||
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
|
||||
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
|
||||
|
||||
|
||||
llama_model_loader::llama_model_loader(
|
||||
|
||||
@@ -14,9 +14,6 @@
|
||||
|
||||
bool llama_model_saver_supports_arch(llm_arch arch) {
|
||||
switch (arch) {
|
||||
case LLM_ARCH_QWEN3NEXT:
|
||||
case LLM_ARCH_QWEN35:
|
||||
case LLM_ARCH_QWEN35MOE:
|
||||
case LLM_ARCH_PLAMO3:
|
||||
case LLM_ARCH_GEMMA3:
|
||||
case LLM_ARCH_GEMMA3N:
|
||||
@@ -29,6 +26,7 @@ bool llama_model_saver_supports_arch(llm_arch arch) {
|
||||
case LLM_ARCH_APERTUS:
|
||||
case LLM_ARCH_MIMO2:
|
||||
case LLM_ARCH_STEP35:
|
||||
case LLM_ARCH_MELLUM:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
@@ -106,6 +104,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, c
|
||||
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
|
||||
} else if (std::is_same<typename Container::value_type, uint32_t>::value) {
|
||||
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
|
||||
} else if (std::is_same<typename Container::value_type, bool>::value) {
|
||||
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_BOOL, value.data(), n_values);
|
||||
} else if (std::is_same<typename Container::value_type, int32_t>::value) {
|
||||
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
|
||||
} else if (std::is_same<typename Container::value_type, float>::value) {
|
||||
@@ -244,7 +244,7 @@ void llama_model_saver::add_kv_from_model() {
|
||||
add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
||||
add_kv(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count);
|
||||
add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
|
||||
// add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???);
|
||||
// add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???); // saved as LLM_KV_ATTENTION_RECURRENT_LAYERS instead
|
||||
|
||||
add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
|
||||
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
|
||||
@@ -278,6 +278,7 @@ void llama_model_saver::add_kv_from_model() {
|
||||
add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
|
||||
add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
|
||||
add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k);
|
||||
add_kv(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, true);
|
||||
|
||||
const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
|
||||
|
||||
|
||||
+67
-36
@@ -81,6 +81,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
|
||||
return new llama_model_mpt(params);
|
||||
case LLM_ARCH_STABLELM:
|
||||
return new llama_model_stablelm(params);
|
||||
case LLM_ARCH_MELLUM:
|
||||
return new llama_model_mellum(params);
|
||||
case LLM_ARCH_QWEN:
|
||||
return new llama_model_qwen(params);
|
||||
case LLM_ARCH_QWEN2:
|
||||
@@ -371,10 +373,10 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
// count only the same type of previous layers to avoid this
|
||||
auto get_il_eff = [&](const size_t il){
|
||||
size_t ret = 0;
|
||||
const bool il_is_recurrent = hparams.is_recurrent(il);
|
||||
const bool il_is_swa = hparams.is_swa(il);
|
||||
const bool il_is_recr = hparams.is_recr(il);
|
||||
const bool il_is_swa = hparams.is_swa(il);
|
||||
for (size_t il_prev = 0; il_prev < il; il_prev++) {
|
||||
ret += hparams.is_recurrent(il_prev) == il_is_recurrent && hparams.is_swa(il_prev) == il_is_swa;
|
||||
ret += hparams.is_recr(il_prev) == il_is_recr && hparams.is_swa(il_prev) == il_is_swa;
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
@@ -488,7 +490,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_MIRRORED);
|
||||
};
|
||||
|
||||
auto get_split_segments = [&](int axis, uint32_t il) -> std::vector<int64_t> {
|
||||
auto get_split_segments = [&](int axis, uint32_t il) -> std::vector<std::pair<int64_t, uint32_t>> {
|
||||
if (ud->model->arch == LLM_ARCH_QWEN3NEXT || ud->model->arch == LLM_ARCH_QWEN35 || ud->model->arch == LLM_ARCH_QWEN35MOE) {
|
||||
const int64_t head_k_dim = hparams.ssm_d_state;
|
||||
const int64_t head_v_dim = hparams.ssm_d_state;
|
||||
@@ -503,26 +505,26 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
if (ud->model->arch == LLM_ARCH_QWEN3NEXT) {
|
||||
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d)) {
|
||||
GGML_ASSERT(tensor->ne[axis] == 2*key_dim + value_dim);
|
||||
return {key_dim, key_dim, value_dim};
|
||||
return {{key_dim, 2}, {value_dim, 1}};
|
||||
}
|
||||
} else {
|
||||
const int64_t head_ratio = n_v_heads / n_k_heads;
|
||||
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d)) {
|
||||
GGML_ASSERT(tensor->ne[axis] == 2*key_dim + value_dim);
|
||||
return std::vector<int64_t>(2 + head_ratio, key_dim);
|
||||
return {{key_dim, 2 + head_ratio}};
|
||||
}
|
||||
if (std::regex_match(tensor_name, pattern_attn_gate_weight) || std::regex_match(tensor_name, pattern_ssm_out_weight)) {
|
||||
return std::vector<int64_t>(head_ratio, key_dim);
|
||||
return {{key_dim, head_ratio}};
|
||||
}
|
||||
if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a) ||
|
||||
std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta)) {
|
||||
return std::vector<int64_t>(head_ratio, n_k_heads);
|
||||
return {{n_k_heads, head_ratio}};
|
||||
}
|
||||
if (std::regex_match(tensor_name, pattern_r_cache)) {
|
||||
return std::vector<int64_t>(2 + head_ratio, key_dim * (hparams.ssm_d_conv - 1));
|
||||
return {{key_dim * (hparams.ssm_d_conv - 1), 2 + head_ratio}};
|
||||
}
|
||||
if (std::regex_match(tensor_name, pattern_s_cache)) {
|
||||
return std::vector<int64_t>(head_ratio, n_k_heads * head_v_dim * head_v_dim);
|
||||
return {{n_k_heads * head_v_dim * head_v_dim, head_ratio}};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -530,9 +532,9 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) {
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp;
|
||||
GGML_ASSERT(tensor->ne[axis] == 2*n_ff_exp);
|
||||
return {n_ff_exp, n_ff_exp};
|
||||
return {{n_ff_exp, 2}};
|
||||
}
|
||||
return {tensor->ne[axis]};
|
||||
return {{tensor->ne[axis], 1}};
|
||||
}
|
||||
|
||||
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_qkv_bias)) {
|
||||
@@ -540,18 +542,18 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(il);
|
||||
GGML_ASSERT(hparams.n_embd_k_gqa() == n_embd_gqa);
|
||||
GGML_ASSERT(tensor->ne[axis] == n_embd + 2*n_embd_gqa);
|
||||
return {n_embd, n_embd_gqa, n_embd_gqa};
|
||||
return {{n_embd, 1}, {n_embd_gqa, 2}};
|
||||
}
|
||||
if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) {
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp;
|
||||
GGML_ASSERT(tensor->ne[axis] == 2*n_ff_exp);
|
||||
return {n_ff_exp, n_ff_exp};
|
||||
return {{n_ff_exp, 2}};
|
||||
}
|
||||
return {tensor->ne[axis]};
|
||||
return {{tensor->ne[axis], 1}};
|
||||
};
|
||||
|
||||
auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector<int64_t> & segments) -> std::vector<int64_t> {
|
||||
if (hparams.is_recurrent(il)) {
|
||||
auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector<std::pair<int64_t, uint32_t>> & segments) -> std::vector<int64_t> {
|
||||
if (hparams.is_recr(il)) {
|
||||
// linear attention
|
||||
const int64_t head_dim = hparams.ssm_d_state;
|
||||
const int64_t granularity_qkv = std::lcm(blck_size, head_dim);
|
||||
@@ -603,16 +605,16 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
return {granularity_kv};
|
||||
}
|
||||
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_qkv_bias)) {
|
||||
GGML_ASSERT(segments.size() == 3);
|
||||
return {granularity_q, granularity_kv, granularity_kv};
|
||||
GGML_ASSERT(segments.size() == 2);
|
||||
return {granularity_q, granularity_kv};
|
||||
}
|
||||
}
|
||||
|
||||
// FFN
|
||||
if (std::regex_match(tensor_name, pattern_ffn_up_gate_weight) || std::regex_match(tensor_name, pattern_ffn_up_gate_bias) ||
|
||||
std::regex_match(tensor_name, pattern_ffn_gate_up_weight) || std::regex_match(tensor_name, pattern_ffn_down_weight)) {
|
||||
GGML_ASSERT(segments.size() <= 2);
|
||||
return std::vector<int64_t>(segments.size(), blck_size);
|
||||
GGML_ASSERT(segments.size() == 1);
|
||||
return {blck_size};
|
||||
}
|
||||
|
||||
// everything else
|
||||
@@ -636,11 +638,12 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
tensor_split_scan[j] += tensor_split_scan[j - 1];
|
||||
}
|
||||
}
|
||||
const std::vector<int64_t> segments = get_split_segments(split_state.axis, tc.il);
|
||||
const std::vector<std::pair<int64_t, uint32_t>> segments = get_split_segments(split_state.axis, tc.il);
|
||||
const std::vector<int64_t> granularity = get_split_granularity(blck_size, tc.il, segments);
|
||||
for (size_t is = 0; is < segments.size(); is++) {
|
||||
const int64_t ne_s = segments[is];
|
||||
const int64_t g_s = granularity[is];
|
||||
const int64_t ne_s = segments[is].first;
|
||||
const uint32_t nr_s = segments[is].second;
|
||||
const int64_t g_s = granularity[is];
|
||||
GGML_ASSERT(ne_full % g_s == 0);
|
||||
int64_t low = 0;
|
||||
size_t j = 0;
|
||||
@@ -654,10 +657,12 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
low = high;
|
||||
}
|
||||
split_state.ne[is*ud->n_devices + (j + tc.rotation) % ud->n_devices] = ne_s - low;
|
||||
split_state.nr[is] = nr_s;
|
||||
}
|
||||
split_state.n_segments = segments.size();
|
||||
} else {
|
||||
memset(split_state.ne, 0, sizeof(split_state.ne));
|
||||
split_state.nr[0] = 1;
|
||||
split_state.n_segments = 1;
|
||||
}
|
||||
return split_state;
|
||||
@@ -761,6 +766,7 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_A13B: return "A13B";
|
||||
case LLM_TYPE_7B_A1B: return "7B.A1B";
|
||||
case LLM_TYPE_8B_A1B: return "8B.A1B";
|
||||
case LLM_TYPE_12B_A2_5B: return "12B.A2.5B";
|
||||
case LLM_TYPE_16B_A1B: return "16B.A1B";
|
||||
case LLM_TYPE_21B_A3B: return "21B.A3B";
|
||||
case LLM_TYPE_24B_A2B: return "24B.A2B";
|
||||
@@ -819,6 +825,28 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
|
||||
return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
}
|
||||
|
||||
// Maps the GGUF `<arch>.hidden_activation` string to the FFN op type used by the
|
||||
// graph builders. Only gated activations that map cleanly to llm_ffn_op_type are
|
||||
// listed; unrecognized values fall back to GeGLU, which matches the historical
|
||||
// default for ModernBert-style architectures.
|
||||
static const std::map<std::string, llm_ffn_op_type> LLM_FFN_OP_TYPES_FROM_STRING = {
|
||||
{ "gelu", LLM_FFN_GEGLU },
|
||||
{ "geglu", LLM_FFN_GEGLU },
|
||||
{ "silu", LLM_FFN_SWIGLU },
|
||||
{ "swish", LLM_FFN_SWIGLU },
|
||||
{ "swiglu", LLM_FFN_SWIGLU },
|
||||
{ "relu", LLM_FFN_RELU },
|
||||
{ "reglu", LLM_FFN_REGLU },
|
||||
};
|
||||
|
||||
llm_ffn_op_type llm_ffn_op_type_from_string(const std::string & name, llm_ffn_op_type fallback) {
|
||||
const auto it = LLM_FFN_OP_TYPES_FROM_STRING.find(name);
|
||||
if (it != LLM_FFN_OP_TYPES_FROM_STRING.end()) {
|
||||
return it->second;
|
||||
}
|
||||
return fallback;
|
||||
}
|
||||
|
||||
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
|
||||
static buft_list_t make_cpu_buft_list(const std::vector<llama_device> & devices, bool use_extra_bufts, bool no_host) {
|
||||
buft_list_t buft_list;
|
||||
@@ -1048,18 +1076,16 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
|
||||
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||||
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||||
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||||
std::fill(
|
||||
hparams.recurrent_layer_arr.begin(),
|
||||
hparams.recurrent_layer_arr.end(),
|
||||
llm_arch_is_recurrent(ml.get_arch()));
|
||||
|
||||
std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
|
||||
std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
|
||||
std::fill(hparams.is_swa_impl.begin(), hparams.is_swa_impl.end(), 0);
|
||||
std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), llm_arch_is_recurrent(ml.get_arch()) ? 1 : 0);
|
||||
|
||||
std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
|
||||
std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
|
||||
std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
|
||||
std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
|
||||
std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
|
||||
std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
|
||||
|
||||
std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f);
|
||||
std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);
|
||||
|
||||
@@ -1791,7 +1817,11 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
|
||||
if (arch == LLM_ARCH_MELLUM ||
|
||||
arch == LLM_ARCH_QWEN3MOE ||
|
||||
arch == LLM_ARCH_OPENAI_MOE ||
|
||||
arch == LLM_ARCH_QWEN3VLMOE ||
|
||||
arch == LLM_ARCH_RND1) {
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
}
|
||||
|
||||
@@ -2008,18 +2038,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
filter_recr = [&](int32_t) { return true; };
|
||||
} else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
filter_attn = [&](int32_t il) {
|
||||
return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
|
||||
return !hparams.is_recr(il) && hparams.n_ff(il) == 0;
|
||||
};
|
||||
filter_recr = [&](int32_t il) {
|
||||
return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
|
||||
return hparams.is_recr(il) && hparams.n_ff(il) == 0;
|
||||
};
|
||||
} else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) {
|
||||
const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
filter_attn = [&, n_main](int32_t il) {
|
||||
return (uint32_t)il < n_main && !hparams.is_recurrent(il);
|
||||
return (uint32_t)il < n_main && !hparams.is_recr(il);
|
||||
};
|
||||
filter_recr = [&, n_main](int32_t il) {
|
||||
return (uint32_t)il < n_main && hparams.is_recurrent(il);
|
||||
return (uint32_t)il < n_main && hparams.is_recr(il);
|
||||
};
|
||||
}
|
||||
|
||||
@@ -2379,6 +2409,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_MIMO2:
|
||||
case LLM_ARCH_STEP35:
|
||||
case LLM_ARCH_TALKIE:
|
||||
case LLM_ARCH_MELLUM:
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
|
||||
@@ -116,6 +116,7 @@ enum llm_type {
|
||||
LLM_TYPE_A13B,
|
||||
LLM_TYPE_7B_A1B,
|
||||
LLM_TYPE_8B_A1B, // lfm2moe
|
||||
LLM_TYPE_12B_A2_5B,
|
||||
LLM_TYPE_16B_A1B,
|
||||
LLM_TYPE_21B_A3B, // Ernie MoE small
|
||||
LLM_TYPE_24B_A2B, // lfm2moe
|
||||
@@ -145,6 +146,10 @@ enum llm_type {
|
||||
|
||||
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
|
||||
|
||||
// Map a GGUF activation-name string to llm_ffn_op_type. Returns `fallback` if
|
||||
// the string is empty or not recognized.
|
||||
llm_ffn_op_type llm_ffn_op_type_from_string(const std::string & name, llm_ffn_op_type fallback);
|
||||
|
||||
struct llama_layer_posnet {
|
||||
// resnet
|
||||
struct ggml_tensor * norm1 = nullptr;
|
||||
|
||||
+29
-7
@@ -353,6 +353,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
|
||||
case LLAMA_VOCAB_PRE_TYPE_EXAONE:
|
||||
case LLAMA_VOCAB_PRE_TYPE_MINERVA:
|
||||
case LLAMA_VOCAB_PRE_TYPE_MELLUM2:
|
||||
regex_exprs = {
|
||||
"\\p{N}",
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
@@ -432,6 +433,15 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI:
|
||||
// Same lookaheads as GPT4O but with \p{M} added so combining marks
|
||||
// (diacritics) attach to their base letters. Avoids excessive
|
||||
// backtracking on scripts that use them heavily (Bengali, Hindi,
|
||||
// Telugu, Thai, ...). See PR #22716 for benchmarks.
|
||||
regex_exprs = {
|
||||
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}\\p{M}])([^a-z]))*((?=[\\p{L}\\p{M}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}\\p{M}])([^a-z]))+((?=[\\p{L}\\p{M}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_TINY_AYA:
|
||||
regex_exprs = {
|
||||
// original regex from tokenizer.json: "\\d{1,3}(?=(?:\\d{3})*\\b)"
|
||||
@@ -754,7 +764,7 @@ struct llm_tokenizer_wpm_session {
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
||||
// normalize and split by whitespace
|
||||
std::vector<std::string> words = preprocess(text);
|
||||
std::vector<std::string> words = preprocess(text, vocab.get_normalizer_lowercase());
|
||||
// bos token prepended already
|
||||
|
||||
// find the longest tokens that form the words
|
||||
@@ -799,7 +809,7 @@ struct llm_tokenizer_wpm_session {
|
||||
}
|
||||
|
||||
// TODO: reduce string copies by using cpts_offs array
|
||||
static std::vector<std::string> preprocess(const std::string & text) {
|
||||
static std::vector<std::string> preprocess(const std::string & text, bool lowercase) {
|
||||
const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
|
||||
std::vector<std::string> words(1, "");
|
||||
|
||||
@@ -818,7 +828,7 @@ struct llm_tokenizer_wpm_session {
|
||||
continue;
|
||||
}
|
||||
|
||||
const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
|
||||
const std::string s = unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt);
|
||||
if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
|
||||
if (words.back().size()) { // finish previous word if any
|
||||
words.emplace_back();
|
||||
@@ -2142,7 +2152,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "jais-2") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
|
||||
} else if (
|
||||
tokenizer_pre == "gemma4") {
|
||||
tokenizer_pre == "gemma4" ||
|
||||
tokenizer_pre == "granite-embed-multi-311m") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
|
||||
escape_whitespaces = true;
|
||||
} else if (
|
||||
@@ -2159,6 +2170,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
} else if (
|
||||
tokenizer_pre == "whitespace") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE;
|
||||
normalizer_lowercase = false;
|
||||
} else if (
|
||||
tokenizer_pre == "refact") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
|
||||
@@ -2251,6 +2263,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "talkie") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "granite-embed-multi-97m") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI;
|
||||
clean_spaces = false;
|
||||
ignore_merges = true;
|
||||
} else if (
|
||||
tokenizer_pre == "tiny_aya") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA;
|
||||
@@ -2309,6 +2326,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "solar-open") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "mellum2") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_MELLUM2;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
}
|
||||
@@ -2339,9 +2359,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
|
||||
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
|
||||
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false);
|
||||
ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
|
||||
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
|
||||
}
|
||||
|
||||
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
|
||||
@@ -2511,6 +2530,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
}
|
||||
}
|
||||
|
||||
// Lowercase normalizer flag (consulted by WPM / whitespace BPE)
|
||||
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false);
|
||||
|
||||
// auto-detect special tokens by text
|
||||
// TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
|
||||
// for now, we apply this workaround to find the tokens based on their text
|
||||
|
||||
+56
-54
@@ -8,60 +8,62 @@
|
||||
|
||||
// pre-tokenization types
|
||||
enum llama_vocab_pre_type {
|
||||
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
||||
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
||||
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
||||
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
||||
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
||||
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
||||
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
||||
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
||||
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
||||
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
||||
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
||||
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
||||
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
||||
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
||||
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
||||
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
||||
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
|
||||
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
|
||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
|
||||
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
|
||||
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
|
||||
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
|
||||
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
|
||||
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
|
||||
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46,
|
||||
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
|
||||
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
|
||||
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
|
||||
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
|
||||
LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
||||
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
||||
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
||||
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
||||
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
||||
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
||||
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
||||
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
||||
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
||||
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
||||
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
||||
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
||||
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
||||
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
||||
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
||||
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
||||
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
|
||||
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
|
||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
|
||||
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
|
||||
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
|
||||
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
|
||||
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
|
||||
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
|
||||
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46,
|
||||
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
|
||||
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
|
||||
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
|
||||
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
|
||||
LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53,
|
||||
LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI = 54,
|
||||
LLAMA_VOCAB_PRE_TYPE_MELLUM2 = 55,
|
||||
};
|
||||
|
||||
struct LLM_KV;
|
||||
|
||||
+35
-12
@@ -15,6 +15,9 @@ void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 30: type = LLM_TYPE_1_2B; break;
|
||||
@@ -38,21 +41,37 @@ void llama_model_exaone4::load_arch_tensors(llama_model_loader &) {
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
const bool is_nextn = hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers;
|
||||
int flags = 0;
|
||||
if (is_nextn) {
|
||||
// NextN/MTP layers are preserved in GGUF but are not executed yet.
|
||||
flags |= TENSOR_SKIP;
|
||||
}
|
||||
|
||||
auto & layer = layers[i];
|
||||
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, flags);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, flags);
|
||||
|
||||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||
if (!is_nextn) {
|
||||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||
}
|
||||
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
|
||||
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
|
||||
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
|
||||
|
||||
if (is_nextn) {
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags);
|
||||
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,7 +109,11 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
|
||||
}
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
// MTP / NextN tail blocks are loaded for compatibility but not executed (same as exaone-moe).
|
||||
const int n_layer_main = int(n_layer) - int(hparams.nextn_predict_layers);
|
||||
GGML_ASSERT(n_layer_main > 0);
|
||||
|
||||
for (int il = 0; il < n_layer_main; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// use RoPE for SWA layers or non-SWA models
|
||||
@@ -126,7 +149,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
if (il == n_layer_main - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ void llama_model_falcon_h1::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
||||
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
|
||||
|
||||
std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
|
||||
std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), true);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 36:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
|
||||
|
||||
uint32_t n_kv_shared_layers = 0;
|
||||
ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false);
|
||||
|
||||
@@ -20,7 +20,7 @@ void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) {
|
||||
|
||||
// A layer is recurrent IFF the n_head_kv value is set to 0
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
|
||||
hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@@ -71,7 +71,7 @@ void llama_model_granite_hybrid::load_arch_tensors(llama_model_loader &) {
|
||||
// norm
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
if (hparams.is_recurrent(i)) {
|
||||
if (hparams.is_recr(i)) {
|
||||
// ssm layers
|
||||
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
|
||||
|
||||
@@ -158,7 +158,7 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
if (hparams.is_recurrent(il)) {
|
||||
if (hparams.is_recr(il)) {
|
||||
// ssm layer //
|
||||
cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
|
||||
} else {
|
||||
|
||||
@@ -9,7 +9,7 @@ void llama_model_jamba::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
|
||||
hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
@@ -15,7 +15,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) {
|
||||
// Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
|
||||
// Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent
|
||||
hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent
|
||||
}
|
||||
|
||||
// MoE parameters - Kimi uses moe_intermediate_size = 1024
|
||||
@@ -53,7 +53,7 @@ void llama_model_kimi_linear::load_arch_tensors(llama_model_loader &) {
|
||||
const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
|
||||
const int64_t ssm_d_conv = hparams.ssm_d_conv;
|
||||
|
||||
if (hparams.is_recurrent(i)) {
|
||||
if (hparams.is_recr(i)) {
|
||||
// Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
|
||||
// 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
|
||||
layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
|
||||
@@ -285,7 +285,7 @@ llama_model_kimi_linear::graph::graph(const llama_model & model, const llm_graph
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
if (hparams.is_recurrent(il)) {
|
||||
if (hparams.is_recr(il)) {
|
||||
// === KDA Layer (Kimi Delta Attention) with Recurrent State ===
|
||||
// Reference: vLLM kda.py
|
||||
const auto * mctx_cur = inp_rs->mctx;
|
||||
|
||||
+5
-5
@@ -6,7 +6,7 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
|
||||
hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
|
||||
hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0;
|
||||
}
|
||||
hparams.n_layer_dense_lead = hparams.n_layer;
|
||||
switch (hparams.n_ff()) {
|
||||
@@ -19,7 +19,7 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) {
|
||||
if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
|
||||
hparams.swa_layers[il] = !hparams.recurrent_layer_arr[il];
|
||||
hparams.is_swa_impl[il] = !hparams.is_recr_impl[il];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -59,7 +59,7 @@ void llama_model_lfm2::load_arch_tensors(llama_model_loader &) {
|
||||
// for operator_norm
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
if (!hparams.is_recurrent(i)) {
|
||||
if (!hparams.is_recr(i)) {
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
|
||||
@@ -235,8 +235,8 @@ llama_model_lfm2::graph<iswa>::graph(const llama_model & model, const llm_graph_
|
||||
cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(cur, "model.layers.{}.operator_norm", il);
|
||||
|
||||
cur = hparams.is_recurrent(il) ? build_shortconv_block(cur, inp_hybrid->get_recr(), il) :
|
||||
build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il);
|
||||
cur = hparams.is_recr(il) ? build_shortconv_block(cur, inp_hybrid->get_recr(), il) :
|
||||
build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il);
|
||||
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
|
||||
@@ -10,7 +10,7 @@ void llama_model_lfm2moe::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||
|
||||
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
|
||||
hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
|
||||
hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0;
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
@@ -55,7 +55,7 @@ void llama_model_lfm2moe::load_arch_tensors(llama_model_loader &) {
|
||||
// for operator_norm
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
if (!hparams.is_recurrent(i)) {
|
||||
if (!hparams.is_recr(i)) {
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
|
||||
|
||||
@@ -15,7 +15,8 @@ void llama_model_llama4::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.n_attn_temp_floor_scale = 8192;
|
||||
hparams.f_attn_temp_scale = 0.1f;
|
||||
hparams.f_attn_temp_offset = 1.0f;
|
||||
uint32_t swa_period = 4; // pattern: 3 chunked - 1 full
|
||||
|
||||
uint32_t swa_period = 4; // pattern: 3 chunked - 1 full
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
|
||||
|
||||
@@ -0,0 +1,225 @@
|
||||
#include "models.h"
|
||||
|
||||
void llama_model_mellum::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
|
||||
if (hparams.n_swa > 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
|
||||
uint32_t swa_period = 4;
|
||||
const auto res = ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
if (res) {
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
} else {
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
|
||||
}
|
||||
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
} else {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 28: type = LLM_TYPE_12B_A2_5B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_model_mellum::load_arch_tensors(llama_model_loader &) {
|
||||
LLAMA_LOAD_LOCALS;
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
||||
|
||||
if (n_expert == 0) {
|
||||
throw std::runtime_error("n_expert must be > 0 for Mellum");
|
||||
}
|
||||
if (n_expert_used == 0) {
|
||||
throw std::runtime_error("n_expert_used must be > 0 for Mellum");
|
||||
}
|
||||
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
||||
|
||||
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<llm_graph_context> llama_model_mellum::build_arch_graph(const llm_graph_params & params) const {
|
||||
if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
|
||||
return std::make_unique<graph<true>>(*this, params);
|
||||
}
|
||||
return std::make_unique<graph<false>>(*this, params);
|
||||
}
|
||||
|
||||
template <bool iswa>
|
||||
llama_model_mellum::graph<iswa>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
|
||||
inp_attn_type * inp_attn = nullptr;
|
||||
|
||||
if constexpr (iswa) {
|
||||
inp_attn = build_attn_inp_kv_iswa();
|
||||
} else {
|
||||
inp_attn = build_attn_inp_kv();
|
||||
}
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL,
|
||||
model.layers[il].attn_norm, nullptr,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self_attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
|
||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
|
||||
const bool is_swa = hparams.is_swa(il);
|
||||
|
||||
if (is_swa) {
|
||||
// For sliding window layers, use regular rope with no yarn rope scaling.
|
||||
// This is achieved here by setting freq_scale and attn_factor to 1.
|
||||
// We also set ext_factor to 0 to avoid a few unnecessary computations.
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
|
||||
0.0, 1.0, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
|
||||
0.0, 1.0, beta_fast, beta_slow
|
||||
);
|
||||
} else {
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
}
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// MoE
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, nullptr,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
ggml_tensor * moe_out =
|
||||
build_moe_ffn(cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il,
|
||||
nullptr, nullptr,
|
||||
model.layers[il].ffn_up_exps_s,
|
||||
model.layers[il].ffn_gate_exps_s,
|
||||
model.layers[il].ffn_down_exps_s);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
cur = moe_out;
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm, nullptr,
|
||||
LLM_NORM_RMS, -1);
|
||||
|
||||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
template struct llama_model_mellum::graph<false>;
|
||||
template struct llama_model_mellum::graph<true>;
|
||||
@@ -8,7 +8,8 @@ void llama_model_mimo2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
|
||||
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
|
||||
|
||||
float value_scale = 0.0f;
|
||||
if (ml.get_key(LLM_KV_ATTENTION_VALUE_SCALE, value_scale, false) && value_scale != 1.0f) {
|
||||
|
||||
@@ -411,6 +411,18 @@ struct llama_model_stablelm : public llama_model_base {
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
struct llama_model_mellum : public llama_model_base {
|
||||
llama_model_mellum(const struct llama_model_params & params) : llama_model_base(params) {}
|
||||
void load_arch_hparams(llama_model_loader & ml) override;
|
||||
void load_arch_tensors(llama_model_loader & ml) override;
|
||||
|
||||
template <bool iswa>
|
||||
struct graph : public llm_graph_context {
|
||||
graph(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
struct llama_model_qwen : public llama_model_base {
|
||||
llama_model_qwen(const struct llama_model_params & params) : llama_model_base(params) {}
|
||||
@@ -1913,5 +1925,9 @@ struct llama_model_step35 : public llama_model_base {
|
||||
graph(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct graph_mtp : public llm_graph_context {
|
||||
graph_mtp(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
@@ -14,6 +14,14 @@ void llama_model_modern_bert::load_arch_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
||||
// Some ModernBert derivatives (e.g. IBM Granite Embedding 97m R2) use
|
||||
// SiLU/SwiGLU in the FFN instead of the default GELU/GeGLU.
|
||||
hparams.llm_ffn_op = LLM_FFN_GEGLU;
|
||||
std::string hidden_act;
|
||||
if (ml.get_key(LLM_KV_HIDDEN_ACT, hidden_act, false)) {
|
||||
hparams.llm_ffn_op = llm_ffn_op_type_from_string(hidden_act, LLM_FFN_GEGLU);
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 12:
|
||||
type = LLM_TYPE_47M; break; // granite-embedding-small
|
||||
@@ -144,7 +152,8 @@ llama_model_modern_bert::graph::graph(const llama_model & model, const llm_graph
|
||||
NULL, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
|
||||
hparams.llm_ffn_op,
|
||||
LLM_FFN_SEQ, il);
|
||||
|
||||
// attentions bypass the intermediate layer
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
@@ -10,7 +10,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) {
|
||||
// A layer is recurrent IFF the n_head_kv value is set to 0 and
|
||||
// the n_ff value is set to 0
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
|
||||
hparams.is_recr_impl[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@@ -62,7 +62,7 @@ void llama_model_nemotron_h::load_arch_tensors(llama_model_loader &) {
|
||||
// all blocks use the attn norm
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
if (hparams.is_recurrent(i)) {
|
||||
if (hparams.is_recr(i)) {
|
||||
// ssm layers
|
||||
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
|
||||
|
||||
@@ -143,7 +143,7 @@ llama_model_nemotron_h::graph::graph(const llama_model & model, const llm_graph_
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
if (hparams.is_recurrent(il)) {
|
||||
if (hparams.is_recr(il)) {
|
||||
// ssm layer //
|
||||
cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
|
||||
} else if (hparams.n_ff(il) == 0) {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user