Compare commits

...

53 Commits

Author SHA1 Message Date
rankaiyx 42401c72b8 Fix type casting for unaccounted memory calculation (#22424) 2026-04-27 14:31:13 +02:00
Georgi Gerganov e940b3d468 download : prefer q8_0 when q4_k not available (#22428) 2026-04-27 14:30:29 +02:00
ynankani 0f1bb602dd model : remove duplicate wo_s scale after build_attn (Qwen3, LLaMA) (#22421)
Signed-off-by: Yash Nankani <ynankani@nvidia.com>
2026-04-27 09:58:48 +02:00
Sigbjørn Skjæret d13540becd convert : remove input_scale for dequantized fp8 modelopt (#22356) 2026-04-27 08:45:01 +02:00
Adrien Gallouët f84270ea10 ggml : use 64 bytes aligned tile buffers (#21058)
| Model                            | Test   |   t/s OLD |   t/s NEW |   Speedup |
|:---------------------------------|:-------|----------:|----------:|----------:|
| qwen35 0.8B BF16                 | pp512  |    584.59 |    595.41 |      1.02 |
| qwen35 0.8B BF16                 | tg128  |     52.23 |     52.82 |      1.01 |
| qwen35 0.8B IQ2_M - 2.7 bpw      | pp512  |    260.64 |    261.70 |      1.00 |
| qwen35 0.8B IQ2_M - 2.7 bpw      | tg128  |     81.17 |     80.89 |      1.00 |
| qwen35 0.8B IQ2_XXS - 2.0625 bpw | pp512  |    302.36 |    302.56 |      1.00 |
| qwen35 0.8B IQ2_XXS - 2.0625 bpw | tg128  |     84.93 |     85.12 |      1.00 |
| qwen35 0.8B IQ3_XXS - 3.0625 bpw | pp512  |    263.22 |    260.01 |      0.99 |
| qwen35 0.8B IQ3_XXS - 3.0625 bpw | tg128  |     80.29 |     78.94 |      0.98 |
| qwen35 0.8B IQ4_NL - 4.5 bpw     | pp512  |    728.65 |    742.09 |      1.02 |
| qwen35 0.8B IQ4_NL - 4.5 bpw     | tg128  |     82.39 |     84.46 |      1.03 |
| qwen35 0.8B IQ4_XS - 4.25 bpw    | pp512  |    681.33 |    677.06 |      0.99 |
| qwen35 0.8B IQ4_XS - 4.25 bpw    | tg128  |     80.18 |     79.28 |      0.99 |
| qwen35 0.8B Q2_K_M               | pp512  |    413.28 |    415.94 |      1.01 |
| qwen35 0.8B Q2_K_M               | tg128  |     81.90 |     82.78 |      1.01 |
| qwen35 0.8B Q3_K_M               | pp512  |    493.17 |    495.08 |      1.00 |
| qwen35 0.8B Q3_K_M               | tg128  |     82.75 |     83.23 |      1.01 |
| qwen35 0.8B Q3_K_S               | pp512  |    429.35 |    427.64 |      1.00 |
| qwen35 0.8B Q3_K_S               | tg128  |     86.69 |     87.02 |      1.00 |
| qwen35 0.8B Q4_0                 | pp512  |    783.46 |    782.32 |      1.00 |
| qwen35 0.8B Q4_0                 | tg128  |     88.23 |     87.90 |      1.00 |
| qwen35 0.8B Q4_1                 | pp512  |    741.71 |    729.76 |      0.98 |
| qwen35 0.8B Q4_1                 | tg128  |     85.44 |     86.01 |      1.01 |
| qwen35 0.8B Q4_K_M               | pp512  |    676.24 |    681.31 |      1.01 |
| qwen35 0.8B Q4_K_M               | tg128  |     76.59 |     77.06 |      1.01 |
| qwen35 0.8B Q4_K_S               | pp512  |    683.12 |    688.81 |      1.01 |
| qwen35 0.8B Q4_K_S               | tg128  |     80.50 |     81.19 |      1.01 |
| qwen35 0.8B Q5_K_M               | pp512  |    635.33 |    642.11 |      1.01 |
| qwen35 0.8B Q5_K_M               | tg128  |     72.07 |     72.49 |      1.01 |
| qwen35 0.8B Q5_K_S               | pp512  |    660.95 |    658.18 |      1.00 |
| qwen35 0.8B Q5_K_S               | tg128  |     72.19 |     72.95 |      1.01 |
| qwen35 0.8B Q6_K                 | pp512  |    647.97 |    638.84 |      0.99 |
| qwen35 0.8B Q6_K                 | tg128  |     72.83 |     72.49 |      1.00 |
| qwen35 0.8B Q8_0                 | pp512  |    805.01 |    785.49 |      0.98 |
| qwen35 0.8B Q8_0                 | tg128  |     70.10 |     70.13 |      1.00 |

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-04-27 09:30:55 +03:00
Max Krasnyansky 5594d13224 common: fix missing exports in llama-common (#22340)
* common: refactor common/debug to move abort_on_nan into base_callback_data

Passing bool abort_on_nan as template parameter for common_debug_cb_eval is unnecessary and creates an issue with LTO.
It should just be a member of the base_callback_data instead.

* cont : cleanup

* common : use pimpl in debug.h to reduce header dependencies

Move common_debug_cb_user_data's data members (std::regex,
std::vector<uint8_t>) into a private impl struct in debug.cpp.

This removes the includes of common.h and <regex> from debug.h,
reducing transitive dependencies for any translation unit that
includes the header.

Assisted-by: llama.cpp:local pi

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-27 08:06:39 +03:00
Georgi Gerganov f535774325 pr2wt : symlink .pi (#22386) 2026-04-26 19:49:26 +03:00
Rithik Sharma 06a811d085 add performance-portable tuning for register-tile and subgroup matmul (#22241) 2026-04-26 09:26:28 -07:00
Gaurav Garg 78433f606f Fix recurrent state serialization for partial reads and writes (#22362)
The previous code worked only for full tensor reads and writes and was hitting `GGML_ASSERT(size == ggml_nbytes(tensor)); ` assert when tested with llama-server.
2026-04-26 13:34:40 +02:00
Johannes Gäßler 7ec36aa861 Github: set meta backend code owner (#22388) 2026-04-26 13:34:13 +02:00
Oliver Simons b1a5bd4e0c CUDA: better coalesce data-access for contiguous concat (#22330)
Also, distribute all elements across CTAs evenly instead of launching
one CTA per dim
2026-04-26 09:21:45 +02:00
Sigbjørn Skjæret 0c6ee1cade ggml-cpu : re-enable fast gelu_quick_f16 (#22339) 2026-04-26 09:28:14 +03:00
Eve 2dd84169d1 ggml-cpu: optimize avx2 q6_k (#22345) 2026-04-26 09:27:50 +03:00
lhez f454bd7eb8 opencl: add iq4_nl support (#22272)
* opencl: add general support for iq4_nl

* opencl: add iq4_nl gemm/gemv for adreno

* opencl: pack 2 lut entries into a uint
2026-04-25 21:21:58 -07:00
Trivikram Reddy b760272f1a hexagon: guard HMX clock request for v75+ platforms (#22377) 2026-04-25 17:58:26 -07:00
Piotr Wilkin (ilintar) dcad77cc3b chat: fix handling of space in reasoning markers (#22353)
* chat: fix handling of space in reasoning markers

* fix tests

* whitespace
2026-04-25 21:24:13 +02:00
Georgi Gerganov 98dc1418ea spec : fix vocab compat checks (#22358) 2026-04-25 20:11:35 +03:00
Johannes Gäßler 9725a313be CUDA: reduce MMQ stream-k overhead (#22298)
* CUDA: reduce MMQ stream-k overhead

* use 32 bit integers for kbc
2026-04-25 14:15:03 +02:00
Developer-Ecosystem-Engineering d1649047a3 metal : optimize Metal Tensor API usage for GGML_OP_MUL_MAT (#20962)
* Optimize Metal Tensor API usage for matmul2d

Separates the Metal Tensor API (matmul2d) path in kernel_mul_mm into its own standalone kernel, gated by GGML_METAL_HAS_TENSOR.

The legacy simdgroup_matrix kernel is preserved under #else.

Previously both paths were interleaved via #ifdef blocks within a single kernel, forcing the tensor path to share the legacy kernel's data layout and threadgroup memory scheme. Splitting the kernel enabled memory and dispatch optimizations that weren't possible when the two paths shared code structure.

* cont : cleanup

* cont : cleanup

* cont : cleanup

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-25 15:14:28 +03:00
ddh0 9d34231bb8 llama-quant : default ftype param Q5_1 --> Q8_0 (#20828)
Change the default `ftype` in `llama_model_quantize_params` from
`LLAMA_FTYPE_MOSTLY_Q5_1` to `LLAMA_FTYPE_MOSTLY_Q8_0`.

In case some external program naively uses the default quantization
params, we should probably default to a known-good type like Q8_0 rather
than Q5_1, which is rather old.
2026-04-25 09:25:35 +03:00
Georgi Gerganov 8ea8fee966 gitignore : add .pi + personal SYSTEM.md (#22316)
* gitignore : add .pi + personal SYSTEM.md

* cont : fix requirements heading in PR template

* cont : shorten line
2026-04-25 09:20:45 +03:00
Neo Zhang eddd7a13a5 [SYCL] Optimize Q4_0 mul_mat for Arc770, add scripts (#22291)
* opt arc770 for Q4_0

* add for Q4_0

* update the script

* add help script for windows

* update guide

* fix format issue

* convert from dos to unix for format issue

* fix missed -sm parameter
2026-04-25 09:20:14 +03:00
Reese Levine dd2914dc81 ggml-webgpu: support for SSM_SCAN and disable set_rows error checking (#22327)
* Implement ssm_scan

* Remove blocking in graph_compute and check for set rows

* Fix bindings

* Update op support
2026-04-25 09:18:15 +03:00
Piotr Wilkin (ilintar) 0adede866d parser: fix structured output bug (#22302)
* fix very stupid structured output bug

* Things just cannot be too easy.
2026-04-24 23:19:55 +02:00
Trivikram Reddy 361fe72acb Hexagon: Bump HMX Frequency to Max Corner (#22334)
* hexagon: bump HMX freq to max corner

* hex-mm: fix error in log msg
2026-04-24 13:55:17 -07:00
Shreya Jain a702f39597 CI Snapdragon: Switch ubuntu-latest to ubuntu-slim runner (#22303)
* switch ubuntu-latest to ubuntu-slim

* Fix the path for upload so CI doesn't fail

* Update .github/workflows/build-and-test-snapdragon.yml

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Use -slim image for key check and consistent naming for artifact dir

Signed-off-by: Max Krasnyansky <maxk@qti.qualcomm.com>

* Remove check-secret extra job

* move QDC key check for Run QDC jobs step specifically

* add a step before to check the secret for qdc jobs

---------

Signed-off-by: Max Krasnyansky <maxk@qti.qualcomm.com>
Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-24 21:21:36 +02:00
Zheyuan Chen 13d36cf891 ggml-webgpu: enable FLASH_ATTN_EXT on browser without subgroup matrix (#22199)
* ggml-webgpu: add tile flash attention fallback

* ggml-webgpu: add new fields and discard usage of mnk for tile version

* ggml-webgpu: modify the vec path to discard the mnk parameter

* ggml-webgpu: enable flash attention vec and tile version for broswer

* ggml-webgpu: stagging KV for flash attention tile version

* formatting

* turn on subgroup uniformity check

* remove Q_TILE as it is always 1 for vec path

* make row_max and exp_sum to local register

* make different bindings with same underlying buffer to have the same usage flags

* move path selection into the shader library and have the host consume a single flash-attn decision object.

* turn off skip_validation and address buffer overlapping when nwg==1

* formatting

* merge binding when kv overlap
2026-04-24 10:39:09 -07:00
Mengsheng Wu f65bc34c68 hexagon: use DIRID 13 in libggml-htp.inf for modern InfVerif (#22306) 2026-04-24 09:21:33 -07:00
Georgi Gerganov 15fa3c493b metal : print GPU description (#22318) 2026-04-24 13:56:03 +03:00
Adrien Gallouët dc80c5252a common : fix jinja warnings with clang 21 (#22313)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-04-24 12:36:02 +02:00
Georgi Gerganov e583f3b4f5 ggml : minor coding style (#22308) 2026-04-24 11:02:00 +03:00
Georgi Gerganov 017f090442 jinja : remove unused header (#22310) 2026-04-24 11:01:46 +03:00
Georgi Gerganov ffdd983fb8 server : fix swa-full logic (#22288) 2026-04-24 10:17:37 +03:00
Yes You Can Have Your Own 793d0a7931 server: rename debug tags to match --cache-idle-slots naming (#22292) 2026-04-24 09:28:44 +03:00
Mengsheng Wu 8bc492ebb4 hexagon: add SOLVE_TRI op (#21974)
* hexagon: add SOLVE_TRI op

* ggml: fix TODO description for solve_tri

* hexagon: rm unused variable/function warnings

* hexagon: chunk vs batch processingfor better thread utilization

* hexagon: vectorize partial f32 loads

* hexagon: move HVX f32 add/sub/mul wrappers to hvx-base.h

---------

Co-authored-by: Todor Boinovski <todorb@qti.qualcomm.com>
2026-04-23 18:39:13 -07:00
Chen Yuan e5f070a1dc fix(shader): handle the buffer aliasing for rms fuse (#22266) 2026-04-23 16:32:59 -07:00
Ethan Turner fa0b8a70a8 cli: Remove redundant local sampling variables (#20429) (#22264)
This change implements the third requested change in issue 20429.
Because defaults.sampling contains the reasoning budget token count and
the reasoning budget message, it's not necessary to assign them to
struct variables.
2026-04-24 00:53:23 +02:00
Max Krasnyansky 5d2b52d80d hexagon: add support for basic and extended Op profiling (#22269)
* hexagon: restore HTP_OPMASK_QUEUE

* hexagon: honor OPMASK_SKIP_COMPUTE in hmx-matmul

* hex-prof: restore op profiling

* hex-prof: enable PMU

* hexagon: simplify and improve op-queuing with full profiling support

Add separate profile descriptors.

* hexagon: remove opsync and rename opmask into opstage

opsync is no longer needed since the profiler is fully async now.
opmask name was confusing and opstage is more accurate.

* hexagon: refactor opbatch queue handling

* hexagon: add iface hooks for enabling profiler from the host

Also move all the PMU setup stuff out of the hex-utils since it's not inteded for normal use.

* hexagon: make profiler mode configurable

On older devices getting PMU counters is expensive so it's now optional.

* hexagon: add support for setting profiler pmu events from env

* hexagon: simplify profiler output (no need to print buffs, etc)

* hexagon: simplify pmu counter formating

* hexagon: add a simple profile post-proc tool

* hex-prof: add support for reading logs from stdin

* hexagon: document GGML_HEXAGON_PROFILE

* hex-prof: update default width for dims field

* hex-prof: fix linter warnings and errors

* Update ggml/src/ggml-hexagon/htp/htp-ops.h

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update scripts/snapdragon/ggml-hexagon-profile.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Trivikram Reddy <tamarnat@qti.qualcomm.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-23 14:17:21 -07:00
Shreya Jain 187a456370 Enable testing on Snapdragon devices (#21051)
* Add the tests that we want to run on external CI

* remove extra files

* Fixes python issues, reove the deadlock on CI

* remove unecessary changes

* use override to ty.toml

* fix pre-commit and try tests with secret in external repo not upstream

* skip if key is unavailable

* Fix feedback

* switch hexagon to snapdragon

* cleanup

* fix secrets

* remove the copyrights at the top of the files
2026-04-23 13:08:10 -07:00
srkizer 185cbff6f1 server : convert_anthropic_to_oai: also copy chat_template_kwargs (#22154) 2026-04-23 13:32:46 -05:00
Song Li c78fb909b2 server: fix heap-buffer-overflow from negative n_discard (CVE-2026-21869) (#22267)
* server: clamp n_discard to non-negative at JSON parse boundary (CVE-2026-21869)

A negative n_discard from client JSON causes heap-buffer-overflow in
update_slots() context-shift loop (CWE-787, CVSS 8.8). Clamp to 0 at
ingress; n_discard=0 already triggers auto-discard (n_left/2).

Ref: GHSA-8947-pfff-2f3c

* cont : cleaner

* cont : cleanerer

* cont : cleanest

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-23 18:39:07 +02:00
Adrien Gallouët 12568ca8c8 vendor : update LibreSSL to 4.3.1 (#22285)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-04-23 17:45:56 +02:00
kvc0 c807c6e3b0 server: (anthropic API) fix prefix caching (#21793)
When testing claude code against llama.cpp, I noticed that only
n_past 18577 was used even when context was 60k or more. The log
in llama-server says:
```
slot update_slots: id  3 | task 10342 | old: ... ; cch= | defa0;You are
slot update_slots: id  3 | task 10342 | new: ... ; cch= | 1c8b4;
```
I observed that the cch value changed every time. Reading about that,
the x-anthropic-billing-header system message seems to be specially
handled inside of the anthropic api. I could remove it, but there
is a meaningful string sometimes included at the end. So instead,
I just replace the changing cch checksum with fffff.

I'm treating this as an anthropic message body API detail - I think this
is the right way to do this, but by all means please correct me!

It's always 5 hexadecimal characters, but I've written the replacement
defensively in case they change the protocol.
2026-04-23 17:45:02 +02:00
Sigbjørn Skjæret 0949beb5a3 fix build number for sycl release (#22283) 2026-04-23 21:38:58 +08:00
Daniel Bevenius 9012c50fc8 model-conversion : fix mmproj output file name [no ci] (#22274)
* model-conversion : fix mmproj output file name [no ci]

This commit updates the convert-model.sh script to properly handle
mmproj output files.

The motivation for this that currently the same name as the original
model is used as the mmproj file, which causes the original model to
be overwritten and no mmproj-<model_name>.gguf to be created.

* model-conversion : use MODEL_NAME [no ci]
2026-04-23 15:07:38 +02:00
Matthias Straka 0dd7f915fd cli : cleanup auto-completion code (#21745) 2026-04-23 15:03:28 +02:00
Tarek Dakhran 550d684bd1 server: Enable transcriptions API for LFM2-Audio (#22000) 2026-04-23 10:47:26 +02:00
Georgi Gerganov 8635e221c8 metal : fix event synchronization (#22260) 2026-04-23 08:22:49 +03:00
Georgi Gerganov 930e0210d1 gitignore: add AGENTS.local.md (#22246)
* gitignore: add AGENTS.local

Assisted-by: llama.cpp:local pi
Signed-off-by: Georgi Gerganov <ggerganov@gmail.com>

* gitignore: rename AGENTS.local to AGENTS.local.md

Assisted-by: llama.cpp:local pi
Signed-off-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Signed-off-by: Georgi Gerganov <ggerganov@gmail.com>
2026-04-23 08:22:24 +03:00
Georgi Gerganov 96c1db26c4 ggml-base: use MATH_LIBRARY variable instead of hardcoded 'm' (#22239)
Fixes #22237 — the find_library(MATH_LIBRARY m) result was being
discarded and the target linked against the literal 'm' string.

This prevents users from overriding the math library (e.g. for AMD AOCL)
via CMake variables. Now the discovered MATH_LIBRARY is used directly.
2026-04-23 08:22:08 +03:00
Neo Zhang Jianyu 4ead6fd957 [SYCL] Update oneapi 2025.3.3, Seperate SYCL build, release Ubuntu 24 package. (#22078)
* upgrade oneAPI to 2025.3.3

* update

* seperate SYCL CI and support release binary package for ubuntu 24

* add dependence

* remove wrong copy lines

* add missed line

* remove other task to test the release for SYCL

* rm more for test release

* fix file name

* correct the error in running

* support build for fp32/fp16

* rm ubuntu-24-sycl-fp16 for duplicated

* refactor build setting

* update guide for ubuntu 24 release package, restore the release.yml for other backend

* user docker replace to install oneAPI

* use download installation package to replace docker

* use wget to download and install oneapi, replace the apt cmd

* enable ccache for oneAPI installation

* fix format error

* enable cache for oneAPI installation

* update guide

* Update .github/workflows/release.yml

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update .github/workflows/release.yml

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update .github/workflows/build-sycl.yml

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update .github/workflows/release.yml

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-23 08:21:36 +03:00
ynankani 5eaee65384 convert : Handle ModelOpt produced mixed precision model during convert to GGUF (#22247)
* Handle ModelOpt produced mixed precision model during convert to GGUF

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-23 08:19:51 +03:00
abotsis 60b68a6279 sycl : fused MoE mul_mat_vec_q for TG (#21920)
* sycl : fused MoE mul_mat_vec_q for TG

Create an MMVQ kernel so ggml_sycl_mul_mat_id can consolidate
n_experts_used matmuls in a single kernel launch. The kernel
also reads expert IDs directly, removing a per-call host sync.

This is similar to the CUDA backend's ggml_cuda_mul_mat_vec_q*
paths.

All types supported in the current MMVQ are supported here as well:
Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0

It will fall back to the existing per-expert path when src0 has been rewritten
by opt_for_reorder(), and for any shape the fused path doesn't handle.

test-backend-ops passes for supported type/shape combos.

Benchmark: Qwen3-Next-35B-A3B Q4_K_M on Intel Arc B70 (SYCL0),
baseline 707c0b7a6, 16k context, -fa 0.

  build/bin/llama-bench -hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M \
    -p 1024 -n 128 -d 16384 -ngl 99 -fa 0 -ub 2048 -r 2 -dev SYCL0

Before (3 runs on 707c0b7a6):

  | test            |            run 1 |            run 2 |            run 3 |
  | --------------- | ----------------:| ----------------:| ----------------:|
  | pp1024 @ d16384 |   533.26 ±  4.87 |   535.20 ±  2.78 |   524.27 ±  3.10 |
  | tg128  @ d16384 |    33.47 ±  0.02 |    33.31 ±  0.02 |    33.17 ±  0.05 |

After (3 runs on 707c0b7a6 + this patch):

  | test            |            run 1 |            run 2 |            run 3 |
  | --------------- | ----------------:| ----------------:| ----------------:|
  | pp1024 @ d16384 |   534.06 ±  0.97 |   531.95 ±  0.02 |   520.94 ± 20.10 |
  | tg128  @ d16384 |    45.85 ±  0.21 |    45.95 ±  0.45 |    46.22 ±  0.12 |

disclosure: Claude wrote it, but I reviewed and understand the implementation
(albeit my C is a little rusty).

* sycl: also support nvfp4 and mxfp4 expert types

* sycl: terser comments/nested dispatch in response to review

* sycl: more comment cleanup in mmvq.cpp/hpp

---------

Co-authored-by: Debian <aaron@openllmi.net.bots.is>
2026-04-23 08:18:56 +03:00
121 changed files with 10589 additions and 3278 deletions
+1 -1
View File
@@ -1,4 +1,4 @@
ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
## Build Image
+1 -1
View File
@@ -6,7 +6,7 @@
<!-- You can provide more details and link related discussions here. Delete this section if not applicable -->
# Requirements
## Requirements
<!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->
@@ -0,0 +1,116 @@
name: CI (snapdragon)
on:
workflow_dispatch:
push:
branches:
- master
paths:
- '.github/workflows/build-and-test-snapdragon.yml'
- 'ggml/include/ggml-hexagon.h'
- 'ggml/src/ggml-hexagon/**'
- 'docs/backend/snapdragon/**'
- 'scripts/snapdragon/**'
- 'CMakePresets.json'
pull_request:
types: [opened, synchronize, reopened]
paths:
- '.github/workflows/build-and-test-snapdragon.yml'
- 'ggml/include/ggml-hexagon.h'
- 'ggml/src/ggml-hexagon/**'
- 'docs/backend/snapdragon/**'
- 'scripts/snapdragon/**'
- 'CMakePresets.json'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
android-ndk-snapdragon:
runs-on: ubuntu-latest
container:
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
defaults:
run:
shell: bash
steps:
- name: Clone
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
- name: Build Llama.CPP for Snapdragon Android
id: build_llama_cpp_snapdragon_android
run: |
cp docs/backend/snapdragon/CMakeUserPresets.json .
cmake --preset arm64-android-snapdragon-release -B build
cmake --build build
cmake --install build --prefix pkg-snapdragon/llama.cpp
- name: Upload Llama.CPP Snapdragon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-arm64-snapdragon
path: pkg-snapdragon/llama.cpp
test-snapdragon-qdc:
name: Test on QDC Android Device (${{ matrix.device }})
needs: [android-ndk-snapdragon]
runs-on: ubuntu-slim
strategy:
fail-fast: false
matrix:
device: [SM8750, SM8650, SM8850]
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Download build artifact
uses: actions/download-artifact@v7
with:
name: llama-cpp-android-arm64-snapdragon
path: pkg-snapdragon/llama.cpp
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
cache: pip
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y curl unzip
- name: Install QDC SDK wheel
run: |
curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
unzip qdc_sdk.zip -d qdc_sdk
pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
- name: Check QDC API key
id: check_secret
env:
QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
run: echo "has-qdc-key=${{ env.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
- name: Run QDC tests (${{ matrix.device }})
if: steps.check_secret.outputs.has-qdc-key == 'true'
run: |
python scripts/snapdragon/qdc/run_qdc_jobs.py \
--test all \
--pkg-dir pkg-snapdragon/llama.cpp \
--model-url "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
--device ${{ matrix.device }}
env:
QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
- name: Cleanup
if: always()
run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip
+18 -31
View File
@@ -1,26 +1,24 @@
name: CI (android)
on:
workflow_dispatch: # allows manual triggering
workflow_dispatch:
push:
branches:
- master
paths: [
'.github/workflows/build-android.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
paths:
- '.github/workflows/build-android.yml'
- '**/CMakeLists.txt'
- '**/.cmake'
- '**/*.h'
- '**/*.hpp'
- '**/*.c'
- '**/*.cpp'
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-android.yml',
'examples/llama.android/**'
]
paths:
- '.github/workflows/build-android.yml'
- 'examples/llama.android/**'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -67,35 +65,24 @@ jobs:
defaults:
run:
shell: bash
strategy:
matrix:
include:
- build: 'arm64-cpu'
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
- build: 'arm64-snapdragon'
defines: '--preset arm64-android-snapdragon-release'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
- name: Build Llama.CPP for Hexagon Android
id: build_llama_cpp_hexagon_android
- name: Build
id: ndk_build
run: |
if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
cp docs/backend/snapdragon/CMakeUserPresets.json .
fi
cmake ${{ matrix.defines }} -B build
cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build
cmake --build build
cmake --install build --prefix pkg-adb/llama.cpp
- name: Upload Llama.CPP Hexagon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
- name: Upload Android Build Artifact
if: ${{ always() && steps.ndk_build.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-${{ matrix.build }}
name: llama-cpp-android-arm64-cpu
path: pkg-adb/llama.cpp
+142
View File
@@ -0,0 +1,142 @@
name: CI (sycl)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-sycl.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-sycl.yml',
'ggml/src/ggml-sycl/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-sycl:
strategy:
matrix:
build: [fp32, fp16]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
continue-on-error: true
steps:
- uses: actions/checkout@v6
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
cd /tmp
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-sycl-${{ matrix.build }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DLLAMA_OPENSSL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_SYCL_F16=${{ matrix.fp16 }}
time cmake --build build --config Release -j $(nproc)
windows-latest-sycl:
runs-on: windows-2022
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: windows-latest-sycl
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat
-133
View File
@@ -555,106 +555,6 @@ jobs:
-DGGML_MUSA=ON
time cmake --build build --config Release -j $(nproc)
ubuntu-22-sycl:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v6
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-sycl
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
time cmake --build build --config Release -j $(nproc)
ubuntu-22-sycl-fp16:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v6
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-sycl-fp16
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DGGML_SYCL_F16=ON
time cmake --build build --config Release -j $(nproc)
windows-latest:
runs-on: windows-2025
@@ -863,39 +763,6 @@ jobs:
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
windows-latest-sycl:
runs-on: windows-2022
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: windows-latest-sycl
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat
windows-latest-hip:
runs-on: windows-2022
+94 -5
View File
@@ -598,15 +598,29 @@ jobs:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
@@ -614,10 +628,6 @@ jobs:
variant: ccache
evict-old-files: 1d
- name: Install
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: Build
id: cmake_build
shell: cmd
@@ -670,6 +680,82 @@ jobs:
path: llama-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
ubuntu-24-sycl:
strategy:
matrix:
build: [fp32, fp16]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
cd /tmp
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-sycl-${{ matrix.build }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DLLAMA_OPENSSL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_SYCL_F16=${{ matrix.fp16 }}
time cmake --build build --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
ubuntu-22-rocm:
runs-on: ubuntu-22.04
@@ -1045,6 +1131,7 @@ jobs:
- ubuntu-cpu
- ubuntu-vulkan
- ubuntu-24-openvino
- ubuntu-24-sycl
- android-arm64
- macOS-cpu
- ios-xcode-build
@@ -1133,6 +1220,8 @@ jobs:
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
- [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
**Android:**
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
+13 -1
View File
@@ -34,7 +34,6 @@
/.vscode/
/nppBackup
# Coverage
/gcovr-report/
@@ -74,6 +73,7 @@
!/models/templates
# Zig
/zig-out/
/zig-cache/
@@ -93,6 +93,7 @@
!/examples/sycl/*.sh
# Server Web UI temporary files
/tools/server/webui/node_modules
/tools/server/webui/dist
# we no longer use gz for index.html
@@ -106,9 +107,11 @@ __pycache__/
poetry.toml
# Nix
/result
# Test binaries
/tests/test-backend-ops
/tests/test-double-float
/tests/test-grad0
@@ -124,6 +127,7 @@ poetry.toml
/tests/test-tokenizer-1-spm
# Scripts
!/scripts/install-oneapi.bat
# Generated by scripts
@@ -132,16 +136,24 @@ poetry.toml
/wikitext-2-raw/
# Test models for lora adapters
/lora-tests
# Local scripts
/run-vim.sh
/run-chat.sh
/run-spec.sh
/.ccache/
# IDE
/*.code-workspace
/.windsurf/
# emscripten
a.out.*
# AGENTS
AGENTS.local.md
.pi/SYSTEM.md
+33
View File
@@ -0,0 +1,33 @@
You are a coding agent. Here are some very important rules that you must follow:
General:
- By very precise and concise when writing code, comments, explanations, etc.
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
- Don't try to build or run the code unless you are explicitly asked to do so
Coding:
- When in doubt, always refer to the CONTRIBUTING.md file of the project
- When referencing issues or PRs in comments, use the format:
- C/C++ code: `// ref: <url>`
- Other (CMake, etc.): `# ref: <url>`
Pull requests (PRs):
- New branch names are prefixed with "gg/"
- Before opening a pull request, ask the user to confirm the description
- When creating a pull request, look for the repository's PR template and follow it
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
- Always create the pull requests in draft mode
Commits:
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
- Do not explicitly set the git author in commits - rely on the default git config
Resources (read on demand):
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md)
- [PEG parser](docs/development/parsing.md)
- [Auto parser](docs/autoparser.md)
- [Jinja engine](common/jinja/README.md)
- [PR template](.github/pull_request_template.md)
+6 -5
View File
@@ -53,28 +53,29 @@
/examples/speculative/ @ggerganov
/ggml/cmake/ @ggerganov
/ggml/include/ @ggerganov
/ggml/src/ggml-backend-meta.cpp @JohannesGaessler
/ggml/src/ggml-cann/ @ggml-org/ggml-cann
/ggml/src/ggml-common.h @ggerganov
/ggml/src/ggml-cpu/ @ggerganov
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
/ggml/src/ggml-cuda/ @ggml-org/ggml-cuda
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
/ggml/src/ggml-hip/ @IMbackK
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
/ggml/src/ggml-hip/ @IMbackK
/ggml/src/ggml-impl.h @ggerganov
/ggml/src/ggml-metal/ @ggml-org/ggml-metal
/ggml/src/ggml-opencl/ @ggml-org/ggml-opencl
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @ggml-org/ggml-rpc
/ggml/src/ggml-sycl/ @ggml-org/ggml-sycl
/ggml/src/ggml-threading.* @ggerganov
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
/ggml/src/ggml-virtgpu/ @kpouget
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
/ggml/src/ggml-webgpu/ @ggml-org/ggml-webgpu
/ggml/src/ggml-zdnn/ @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
/ggml/src/ggml.c @ggerganov
/ggml/src/ggml.cpp @ggerganov
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
+4 -4
View File
@@ -296,7 +296,7 @@ void analyze_reasoning::compare_reasoning_presence() {
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
});
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.tag("post", (p.space() + p.marker() + p.space())) + p.rest();
});
// try the more aggressive parse first, if it fails, fall back to the delimiter one
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
@@ -306,11 +306,11 @@ void analyze_reasoning::compare_reasoning_presence() {
if (result.result.success()) {
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
mode = reasoning_mode::TAG_BASED;
start = trim_leading_whitespace(result.tags["pre"]);
end = trim_trailing_whitespace(result.tags["post"]);
start = result.tags["pre"];
end = result.tags["post"];
} else if (!result.tags["post"].empty()) {
mode = reasoning_mode::TAG_BASED;
end = trim_trailing_whitespace(result.tags["post"]);
end = result.tags["post"];
}
}
}
+21 -5
View File
@@ -544,6 +544,26 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
return tmpls->has_explicit_template;
}
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
static bool is_lfm2_template(const std::string & src) {
return src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos;
}
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) {
common_chat_prompt_preset asr_preset;
asr_preset.system = "";
asr_preset.user = "Transcribe audio to text";
if (chat_templates && chat_templates->template_default && is_lfm2_template(chat_templates->template_default->source())) {
asr_preset.system = "Perform ASR.";
asr_preset.user = "";
}
return asr_preset;
}
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
if (!variant.empty()) {
if (variant == "tool_use") {
@@ -2053,10 +2073,7 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
return common_chat_params_init_kimi_k2(tmpl, params);
}
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
if (src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos) {
if (is_lfm2_template(src)) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
}
@@ -2365,4 +2382,3 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
GGML_ASSERT(chat_templates->template_default != nullptr);
return chat_templates->template_default->caps.to_map();
}
+8
View File
@@ -274,3 +274,11 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
const common_chat_template & tmpl,
const std::string & src,
autoparser::generation_params & params);
// specialized per-task preset
struct common_chat_prompt_preset {
std::string system;
std::string user;
};
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
+5
View File
@@ -746,6 +746,11 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
str.compare(0, prefix.size(), prefix) == 0;
}
// remove when moving to c++20
inline bool string_starts_with(std::string_view str, char prefix) {
return !str.empty() && str.front() == prefix;
}
// remove when moving to c++20
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
return str.size() >= suffix.size() &&
+40 -17
View File
@@ -1,9 +1,38 @@
#include "debug.h"
#include "common.h"
#include "log.h"
#include <cmath>
#include <regex>
#include <string>
#include <vector>
struct common_debug_cb_user_data::impl {
std::vector<uint8_t> data;
std::vector<std::regex> tensor_filters;
bool abort_on_nan{false};
};
common_debug_cb_user_data::common_debug_cb_user_data() : pimpl(std::make_unique<impl>()) {}
common_debug_cb_user_data::~common_debug_cb_user_data() = default;
common_debug_cb_user_data::common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan)
: pimpl(std::make_unique<impl>())
{
for (const auto & pattern : filter_patterns) {
try {
std::string anchored_pattern = "^" + pattern;
pimpl->tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
} catch (const std::regex_error & e) {
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
}
}
pimpl->abort_on_nan = abort_on_nan;
params.cb_eval = common_debug_cb_eval;
params.cb_eval_user_data = this;
}
static std::string common_ggml_ne_string(const ggml_tensor * t) {
std::string str;
@@ -47,8 +76,7 @@ static float common_ggml_get_float_value(const uint8_t * data,
#define INDENT " "
template <bool abort>
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n, bool abort_on_nan) {
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -94,7 +122,7 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
LOG(INDENT "sum = %f\n", sum);
}
if constexpr (abort) {
if (abort_on_nan) {
if (std::isnan(sum)) {
LOG("encountered NaN - aborting\n");
exit(0);
@@ -112,8 +140,9 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
* @param user_data user data to pass at each call back
* @return true to receive data or continue the graph, false otherwise
*/
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (base_callback_data *) user_data;
bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (common_debug_cb_user_data *) user_data;
auto * pimpl = cb_data->pimpl.get();
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
@@ -122,10 +151,10 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
return true; // Always retrieve data
}
bool matches_filter = cb_data->tensor_filters.empty();
bool matches_filter = pimpl->tensor_filters.empty();
if (!matches_filter) {
for (const auto & filter : cb_data->tensor_filters) {
for (const auto & filter : pimpl->tensor_filters) {
if (std::regex_search(t->name, filter)) {
matches_filter = true;
break;
@@ -148,20 +177,14 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
if (!is_host) {
auto n_bytes = ggml_nbytes(t);
cb_data->data.resize(n_bytes);
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
pimpl->data.resize(n_bytes);
ggml_backend_tensor_get(t, pimpl->data.data(), 0, n_bytes);
}
if (!ggml_is_quantized(t->type) && matches_filter) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
uint8_t * data = is_host ? (uint8_t *) t->data : pimpl->data.data();
common_debug_print_tensor(data, t->type, t->ne, t->nb, 3, pimpl->abort_on_nan);
}
return true;
}
// Explicit template instantiations
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
+17 -29
View File
@@ -1,43 +1,31 @@
#pragma once
#include "common.h"
#include <memory>
#include <string>
#include <vector>
#include <regex>
// common debug functions and structs
// Print a tensor's detailed data
// data - the tensor's data in byte format
// type - the tensor's quantization type
// ne - the tensor dimensions array
// nb - the tensor strides array
// n - the number of rows/columns to fully print
template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
struct common_params;
// Intended to use as callback for ggml_backend_sched_eval_callback
// prints tensors that are processed in the computation graph
// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
// non-empty filter_patterns. See examples/debug.ccp for possible usage patterns
// The template parameter determines whether an error should be thrown whenever a NaN is encountered
// by default prints all tensors, but can be configured by creating a `common_debug_cb_user_data` instance with
// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
// `common_debug_cb_user_data` contains `abort_on_nan` flag that determines whether an error should be thrown whenever a NaN is encountered
// in a tensor (useful for stopping debug sessions on first erroneous tensor)
// The callback data will be passed as the third parameter (user_data)
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
struct base_callback_data {
std::vector<uint8_t> data;
std::vector<std::regex> tensor_filters;
bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
base_callback_data() = default;
struct common_debug_cb_user_data {
struct impl;
std::unique_ptr<impl> pimpl;
base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
for (const auto & pattern : filter_patterns) {
try {
std::string anchored_pattern = "^" + pattern;
tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
} catch (const std::regex_error & e) {
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
}
}
params.cb_eval = common_debug_cb_eval<false>;
params.cb_eval_user_data = this;
}
common_debug_cb_user_data();
~common_debug_cb_user_data();
common_debug_cb_user_data(const common_debug_cb_user_data &) = delete;
common_debug_cb_user_data & operator=(const common_debug_cb_user_data &) = delete;
common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan = false);
};
+1 -1
View File
@@ -627,7 +627,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
if (!tag.empty()) {
tags.push_back(tag);
} else {
tags = {"Q4_K_M", "Q4_0"};
tags = {"Q4_K_M", "Q8_0"};
}
for (const auto & t : tags) {
+2 -2
View File
@@ -856,7 +856,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
ggml_backend_dev_memory(dev, &free, &total);
const size_t self = mb.model + mb.context + mb.compute;
const size_t unaccounted = total - self - free;
const int64_t unaccounted = static_cast<int64_t>(total) - static_cast<int64_t>(free) - static_cast<int64_t>(self);
table_data.push_back({
template_gpu,
@@ -867,7 +867,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
std::to_string(mb.model / MiB),
std::to_string(mb.context / MiB),
std::to_string(mb.compute / MiB),
std::to_string(unaccounted / MiB)});
std::to_string(unaccounted / static_cast<int64_t>(MiB))});
}
// print memory breakdown for host:
-1
View File
@@ -1,4 +1,3 @@
#include "log.h"
#include "value.h"
#include "runtime.h"
#include "caps.h"
+11 -5
View File
@@ -106,10 +106,16 @@ struct statement {
size_t pos; // position in source, for debugging
virtual ~statement() = default;
virtual std::string type() const { return "Statement"; }
// execute_impl must be overridden by derived classes
virtual value execute_impl(context &) { throw std::runtime_error("cannot exec " + type()); }
virtual value execute_impl(context &) { throw_exec_error(); }
// execute is the public method to execute a statement with error handling
value execute(context &);
private:
[[noreturn]] void throw_exec_error() const {
throw std::runtime_error("cannot exec " + type());
}
};
// Type Checking Utilities
@@ -143,7 +149,7 @@ struct program : public statement {
program() = default;
explicit program(statements && body) : body(std::move(body)) {}
std::string type() const override { return "Program"; }
value execute_impl(context &) override {
[[noreturn]] value execute_impl(context &) override {
throw std::runtime_error("Cannot execute program directly, use jinja::runtime instead");
}
};
@@ -195,7 +201,7 @@ struct break_statement : public statement {
}
};
value execute_impl(context &) override {
[[noreturn]] value execute_impl(context &) override {
throw break_statement::signal();
}
};
@@ -209,7 +215,7 @@ struct continue_statement : public statement {
}
};
value execute_impl(context &) override {
[[noreturn]] value execute_impl(context &) override {
throw continue_statement::signal();
}
};
@@ -509,7 +515,7 @@ struct slice_expression : public expression {
chk_type<expression>(this->step_expr);
}
std::string type() const override { return "SliceExpression"; }
value execute_impl(context &) override {
[[noreturn]] value execute_impl(context &) override {
throw std::runtime_error("must be handled by MemberExpression");
}
};
+13 -9
View File
@@ -590,6 +590,10 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == 0;
}
[[noreturn]] static value string_join_not_implemented(const func_args &) {
throw not_implemented_exception("String join builtin not implemented");
}
const func_builtins & value_string_t::get_builtins() const {
static const func_builtins builtins = {
{"default", default_value},
@@ -851,9 +855,7 @@ const func_builtins & value_string_t::get_builtins() const {
res->val_str.mark_input_based_on(val_input->as_string());
return res;
}},
{"join", [](const func_args &) -> value {
throw not_implemented_exception("String join builtin not implemented");
}},
{"join", string_join_not_implemented},
};
return builtins;
}
@@ -884,6 +886,9 @@ const func_builtins & value_bool_t::get_builtins() const {
return builtins;
}
[[noreturn]] static value array_unique_not_implemented(const func_args &) {
throw not_implemented_exception("Array unique builtin not implemented");
}
const func_builtins & value_array_t::get_builtins() const {
static const func_builtins builtins = {
@@ -1084,13 +1089,14 @@ const func_builtins & value_array_t::get_builtins() const {
std::reverse(arr.begin(), arr.end());
return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
}},
{"unique", [](const func_args &) -> value {
throw not_implemented_exception("Array unique builtin not implemented");
}},
{"unique", array_unique_not_implemented},
};
return builtins;
}
[[noreturn]] static value object_join_not_implemented(const func_args &) {
throw not_implemented_exception("object join not implemented");
}
const func_builtins & value_object_t::get_builtins() const {
if (!has_builtins) {
@@ -1183,9 +1189,7 @@ const func_builtins & value_object_t::get_builtins() const {
});
return result;
}},
{"join", [](const func_args &) -> value {
throw not_implemented_exception("object join not implemented");
}},
{"join", object_join_not_implemented},
};
return builtins;
}
+21 -18
View File
@@ -129,27 +129,25 @@ struct value_t {
// Note: only for debugging and error reporting purposes
virtual std::string type() const { return ""; }
virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
virtual double as_float() const { throw std::runtime_error(type() + " is not a float value"); }
virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
virtual int64_t as_int() const { throw_type_error("is not an int value"); }
virtual double as_float() const { throw_type_error("is not a float value"); }
virtual string as_string() const { throw_type_error("is not a string value"); }
virtual bool as_bool() const { throw_type_error("is not a bool value"); }
virtual const std::vector<value> & as_array() const { throw_type_error("is not an array value"); }
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw_type_error("is not an object value"); }
virtual value invoke(const func_args &) const { throw_type_error("is not a function value"); }
virtual bool is_none() const { return false; }
virtual bool is_undefined() const { return false; }
virtual const func_builtins & get_builtins() const {
throw std::runtime_error("No builtins available for type " + type());
}
virtual const func_builtins & get_builtins() const { throw_type_error("has no builtins"); }
virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); }
virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); }
virtual bool has_key(const value &) { throw_type_error("is not an object value"); }
virtual void insert(const value & /* key */, const value & /* val */) { throw_type_error("is not an object value"); }
virtual value & at(const value & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
virtual value & at(const value & /* key */) { throw_type_error("is not an object value"); }
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
virtual value & at(const std::string & /* key */) { throw_type_error("is not an object value"); }
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw_type_error("is not an array value"); }
virtual value & at(int64_t /* idx */) { throw_type_error("is not an array value"); }
virtual bool is_numeric() const { return false; }
virtual bool is_hashable() const { return false; }
@@ -163,6 +161,11 @@ struct value_t {
// Note: only for debugging purposes
virtual std::string as_repr() const { return as_string().str(); }
private:
[[noreturn]] void throw_type_error(const char* expected) const {
throw std::runtime_error(type() + " " + expected);
}
protected:
virtual bool equivalent(const value_t &) const = 0;
virtual bool nonequal(const value_t & other) const { return !equivalent(other); }
+17 -9
View File
@@ -61,18 +61,26 @@ static bool common_speculative_are_compatible(
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) {
LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
LOG_WRN("%s: draft model vocab type must match target model to use speculation but "
"vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
return false;
}
if (
llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
) {
LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
(llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
LOG_WRN("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
__func__,
llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
return false;
}
if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
(llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
LOG_WRN("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
__func__,
llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
return false;
}
+27 -31
View File
@@ -272,6 +272,22 @@ class ModelBase:
return tensors
@staticmethod
def _scale_is_trivial(scale: Tensor) -> bool:
return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6
def _write_scale_tensor(self, scale_name: str, scale: Tensor):
if not self._scale_is_trivial(scale):
scale_f32 = scale.float().numpy().flatten()
logger.info(f" + {scale_name} (per-tensor scale, shape [{scale_f32.size}])")
self.gguf_writer.add_tensor(scale_name, scale_f32)
def _write_scales_tensor(self, scale_name: str, scales: list[float]):
if not np.allclose(scales, 1.0, atol=1e-6):
scale_vals = np.array(scales, dtype=np.float32)
logger.info(f" + {scale_name} (per-expert scale, shape [{len(scales)}])")
self.gguf_writer.add_tensor(scale_name, scale_vals)
def dequant_model(self):
# If all quantized tensors were already handled (e.g. pure NVFP4), skip
if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors):
@@ -494,7 +510,7 @@ class ModelBase:
s = self.model_tensors[name]
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
tensors_to_remove.append(name)
if name.endswith((".k_scale", ".v_scale")):
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
tensors_to_remove.append(name)
elif quant_method is not None:
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
@@ -602,10 +618,6 @@ class ModelBase:
raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
return raw, [out_features, n_super * 64]
@staticmethod
def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
if "language_model." in name:
name = name.replace("language_model.", "")
@@ -616,19 +628,8 @@ class ModelBase:
logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
# Emit per-tensor scale2 as a separate F32 tensor when non-trivial
if not self._nvfp4_scale2_is_trivial(scale2):
scale2_f32 = scale2.float().numpy().flatten()
scale_name = new_name.replace(".weight", ".scale")
logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
self.gguf_writer.add_tensor(scale_name, scale2_f32)
# Emit per-tensor input_scale as a separate F32 tensor when non-trivial
if not self._nvfp4_scale2_is_trivial(input_scale):
input_scale_f32 = input_scale.float().numpy().flatten()
input_scale_name = new_name.replace(".weight", ".input_scale")
logger.info(f" + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
self._write_scale_tensor(new_name.replace(".weight", ".scale"), scale2)
self._write_scale_tensor(new_name.replace(".weight", ".input_scale"), input_scale)
def _generate_nvfp4_tensors(self):
# Per-layer expert merging to avoid holding all experts in memory
@@ -719,21 +720,11 @@ class ModelBase:
logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
# Emit per-expert scale2 tensor if any expert has non-trivial scale2
scales.sort(key=lambda x: x[0])
scale_vals = np.array([s[1] for s in scales], dtype=np.float32)
if not np.allclose(scale_vals, 1.0, atol=1e-6):
scale_name = new_name.replace(".weight", ".scale")
logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
self.gguf_writer.add_tensor(scale_name, scale_vals)
self._write_scales_tensor(new_name.replace(".weight", ".scale"), [s[1] for s in scales])
# Emit per-expert input_scale tensor if any expert has non-trivial input_scale
input_scales.sort(key=lambda x: x[0])
input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
input_scale_name = new_name.replace(".weight", ".input_scale")
logger.info(f" + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
self._write_scales_tensor(new_name.replace(".weight", ".input_scale"), [s[1] for s in input_scales])
del experts, merged
@@ -746,7 +737,12 @@ class ModelBase:
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
with open(quant_config_file, "r", encoding="utf-8") as f:
quant_config = json.load(f).get("quantization") or {}
hf_quant_config = json.load(f)
quant_config = hf_quant_config.get("quantization") or {}
producer = hf_quant_config.get("producer") or {}
producer_name = (producer.get("name") or "").lower()
if quant_method is None:
self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
quant_algo = quant_config.get("quant_algo", quant_algo)
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
+30
View File
@@ -31,6 +31,8 @@ SYCL cross-platform capabilities enable support for other vendor GPUs as well.
## Recommended Release
### Windows
The following releases are verified and recommended:
|Commit ID|Tag|Release|Verified Platform| Update date|
@@ -39,9 +41,22 @@ The following releases are verified and recommended:
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
### Ubuntu 24.04
The release packages for Ubuntu 24.04 x64 (FP32/FP16) only include the binary files of the llama.cpp SYCL backend. They require the target machine to have pre-installed Intel GPU drivers and oneAPI packages that are the same version as the build package. To get the version and installation info, refer to release.yml: ubuntu-24-sycl -> Download & Install oneAPI.
It is recommended to use them with Intel Docker.
The packages for FP32 and FP16 would have different accuracy and performance on LLMs. Please choose it acording to the test result.
## News
- 2026.04
- Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0.
- Fused MoE.
- Upgrate CI and built package for oneAPI 2025.3.3, support Ubuntu 24.04 built package.
- 2026.03
- Support Flash-Attention: less memory usage, performance impact depends on LLM.
@@ -229,6 +244,7 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|Verified release|
|-|
|2025.3.3 |
|2025.2.1|
|2025.1|
|2024.1|
@@ -339,6 +355,12 @@ Choose one of following methods to run.
./examples/sycl/test.sh
```
- Run llama-server:
```sh
./examples/sycl/start-svr.sh -m PATH/MODEL_FILE
```
2. Command line
Launch inference
@@ -627,10 +649,18 @@ Choose one of following methods to run.
1. Script
- Run test:
```
examples\sycl\win-test.bat
```
- Run llama-server:
```
examples\sycl\win-start-svr.bat -m PATH\MODEL_FILE
```
2. Command line
Launch inference
+14 -5
View File
@@ -249,18 +249,27 @@ build: 6a8cf8914 (6733)
```
- `GGML_HEXAGON_PROFILE=1`
Generates a host-side profile for the ggml-hexagon Ops.
Enables Op profiling:
- `GGML_HEXAGON_OPMASK=0x0`
Allows enabling specific stages of the processing pipeline:
- `1` Basic profile with per-op `usecs` and `cycles` counters
- `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data
- `0x1,...,0x8` Extended profile with per-op `usecs`, `cycles` and custom PMU counter data
The logging output can be either saved into a file for post-processing or it can be piped directly into the post-processing tool to generate the report.
Examples:
`GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -`
- `GGML_HEXAGON_OPSTAGE=0x0`
Allows enabling specific stages of the Op processing pipeline:
- `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
- `0x2` Enable Op Compute (MUL_MAT, etc.)
Examples:
`GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
`GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
`GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled
`GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
- `GGML_HEXAGON_OPFILTER=regex`
Allows filtering (disabling) Ops that match the regex pattern:
+3 -3
View File
@@ -26,7 +26,7 @@ Legend:
| CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | | ❌ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -60,7 +60,7 @@ Legend:
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ❌ | ❌ |
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
@@ -105,7 +105,7 @@ Legend:
| SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
+3837 -1664
View File
File diff suppressed because it is too large Load Diff
+9 -5
View File
@@ -202,10 +202,14 @@ static bool run(llama_context * ctx, const common_params & params) {
print_tokenized_prompt(ctx, tokens, params.prompt);
if (params.save_logits) {
output_data output {ctx, model, params};
std::filesystem::path model_path{params.model.path};
std::string model_name{model_path.stem().string()};
save_output_data(output, model_name, params.logits_output_dir);
try {
output_data output {ctx, model, params};
std::filesystem::path model_path{params.model.path};
std::string model_name{model_path.stem().string()};
save_output_data(output, model_name, params.logits_output_dir);
} catch (const std::exception & e) {
LOG_ERR("%s : error saving logits: %s\n", __func__, e.what());
}
}
return true;
@@ -223,7 +227,7 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
std::optional<base_callback_data> cb_data;
std::optional<common_debug_cb_user_data> cb_data;
if (!params.save_logits) {
cb_data.emplace(params, params.tensor_filter);
}
+2 -3
View File
@@ -3,7 +3,6 @@
#include "debug.h"
#include "log.h"
#include "llama.h"
#include "llama-cpp.h"
#include <clocale>
#include <string>
@@ -38,7 +37,7 @@ static bool run(llama_context * ctx, const common_params & params) {
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
base_callback_data cb_data;
common_debug_cb_user_data cb_data;
common_params params;
@@ -53,7 +52,7 @@ int main(int argc, char ** argv) {
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = common_debug_cb_eval<false>;
params.cb_eval = common_debug_cb_eval;
params.cb_eval_user_data = &cb_data;
params.warmup = false;
@@ -25,7 +25,11 @@ MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
TYPE="${OUTTYPE:-f16}"
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
if [[ -n "$MMPROJ" ]]; then
CONVERTED_MODEL="${OUTPUT_DIR}/mmproj-${MODEL_NAME}.gguf"
else
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
fi
echo "Model path: ${MODEL_PATH}"
echo "Model name: ${MODEL_NAME}"
@@ -38,6 +42,7 @@ if [[ -n "$DEBUG" ]]; then
else
CMD_ARGS=("python")
fi
CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose")
CMD_ARGS+=("${MODEL_PATH}")
CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
@@ -50,7 +55,3 @@ CMD_ARGS+=("--outtype" "${TYPE}")
echo ""
echo "The environment variable CONVERTED_MODEL can be set to this path using:"
echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"
if [[ -n "$MMPROJ" ]]; then
mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
echo "The mmproj model was created in $(realpath "$mmproj_file")"
fi
+124
View File
@@ -0,0 +1,124 @@
#!/bin/bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
Help() {
cat << EOF
Usage: $(basename "$0") [OPTIONS]
This script processes files with specified options.
Options:
-h, --help Display this help message and exit.
-c, --context <value> Set context length. Bigger need more memory.
-p, --promote <value> Prompt to start generation with.
-m, --model <value> Full model file path.
-mg,--main-gpu <value> Set main GPU ID (0 - n) for single GPU mode.
-sm,--split-mode <value> How to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
-ngl,--n-gpu-layers <value> Max. number of layers to store in VRAM (default: -1)
-lv,--log-verbosity <value> Set the verbosity threshold. Messages with a higher verbosity will be
ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
EOF
}
BIN_FILE=./build/bin/llama-server
SEED=0
GPUS_SETTING=""
MODEL_FILE=../models/Qwen3.5-4B-Q4_0.gguf
NGL=99
CONTEXT=4096
GGML_SYCL_DEVICE=-1
SPLIT_MODE=layer
LOG_VERBOSE=3
while [[ $# -gt 0 ]]; do
case "$1" in
-c|--context)
CONTEXT=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-m|--model)
MODEL_FILE="$2"
# Shift twice to consume both the option flag and its value
shift
shift
;;
-mg|--main-gpu)
GGML_SYCL_DEVICE=$2
SPLIT_MODE=none
# Shift twice to consume both the option flag and its value
shift
shift
;;
-sm|--split-mode)
SPLIT_MODE=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-ngl|--n-gpu-layers)
NGL=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-lv|--log-verbosity)
LOG_VERBOSE=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-h|--help)
Help
exit 0
;;
*)
# Handle unknown options or stop processing options
echo "Invalid option: $1"
# Optional: exit script or shift to treat remaining as positional args
exit 1
;;
esac
done
source /opt/intel/oneapi/setvars.sh
#export GGML_SYCL_DEBUG=1
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
#support malloc device memory more than 4GB.
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
if [ $GGML_SYCL_DEVICE -ne -1 ]; then
echo "Use $GGML_SYCL_DEVICE as main GPU"
#use signle GPU only
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
else
echo "Use all Intel GPUs, including iGPU & dGPU"
GPUS_SETTING="-sm ${SPLIT_MODE}"
fi
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000
+5 -4
View File
@@ -38,7 +38,7 @@ SEED=0
GPUS_SETTING=""
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
MODEL_FILE=models/llama-2-7b.Q4_0.gguf
MODEL_FILE=../models/llama-2-7b.Q4_0.gguf
NGL=99
CONTEXT=4096
GGML_SYCL_DEVICE=-1
@@ -122,9 +122,10 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
else
echo "Use all Intel GPUs, including iGPU & dGPU"
echo "Use all Intel GPUs, including iGPU & dGPU"
GPUS_SETTING="-sm ${SPLIT_MODE}"
fi
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
+179
View File
@@ -0,0 +1,179 @@
:: MIT license
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT
@echo off
setlocal EnableExtensions EnableDelayedExpansion
set "BIN_FILE=.\build\bin\llama-server.exe"
set "SEED=0"
set "GPUS_SETTING="
set "MODEL_FILE=..\models\Qwen3.5-4B-Q4_0.gguf"
set "NGL=99"
set "CONTEXT=4096"
set "GGML_SYCL_DEVICE=-1"
set "SPLIT_MODE=layer"
set "LOG_VERBOSE=3"
if "%~1"=="" goto after_args
:parse_args
if "%~1"=="" goto after_args
if /I "%~1"=="-c" (
if "%~2"=="" goto missing_value
set "CONTEXT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--context" (
if "%~2"=="" goto missing_value
set "CONTEXT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-m" (
if "%~2"=="" goto missing_value
set "MODEL_FILE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--model" (
if "%~2"=="" goto missing_value
set "MODEL_FILE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-mg" (
if "%~2"=="" goto missing_value
set "GGML_SYCL_DEVICE=%~2"
set "SPLIT_MODE=none"
shift
shift
goto parse_args
)
if /I "%~1"=="--main-gpu" (
if "%~2"=="" goto missing_value
set "GGML_SYCL_DEVICE=%~2"
set "SPLIT_MODE=none"
shift
shift
goto parse_args
)
if /I "%~1"=="-sm" (
if "%~2"=="" goto missing_value
set "SPLIT_MODE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--split-mode" (
if "%~2"=="" goto missing_value
set "SPLIT_MODE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-ngl" (
if "%~2"=="" goto missing_value
set "NGL=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--n-gpu-layers" (
if "%~2"=="" goto missing_value
set "NGL=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-lv" (
if "%~2"=="" goto missing_value
set "LOG_VERBOSE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--log-verbosity" (
if "%~2"=="" goto missing_value
set "LOG_VERBOSE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-h" goto help
if /I "%~1"=="--help" goto help
echo Invalid option: %~1
exit /b 1
:missing_value
echo Missing value for option: %~1
exit /b 1
:help
echo Usage: %~n0 [OPTIONS]
echo.
echo This script processes files with specified options.
echo.
echo Options:
echo -h, --help Display this help message and exit.
echo -c, --context ^<value^> Set context length. Bigger need more memory.
echo -m, --model ^<value^> Full model file path.
echo -mg,--main-gpu ^<value^> Set main GPU ID (0 - n) for single GPU mode.
echo -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
echo - none: use one GPU only
echo - layer (default): split layers and KV across GPUs
echo - row: split rows across GPUs
echo -ngl,--n-gpu-layers ^<value^> Max. number of layers to store in VRAM (default: -1)
echo -lv,--log-verbosity ^<value^> Set the verbosity threshold. Messages with a higher verbosity will be
echo ignored. Values:
echo - 0: generic output
echo - 1: error
echo - 2: warning
echo - 3: info
echo - 4: debug
exit /b 0
:after_args
REM In Windows CMD, source is not available; call oneAPI setvars if present.
if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
) else (
echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
)
REM Support malloc device memory more than 4GB.
set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
if not "%GGML_SYCL_DEVICE%"=="-1" (
echo Use %GGML_SYCL_DEVICE% as main GPU
REM Use single GPU only.
set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
) else (
echo Use all Intel GPUs, including iGPU ^& dGPU
set "GPUS_SETTING=-sm %SPLIT_MODE%"
)
echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
set "ZES_ENABLE_SYSMAN=1"
%BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
endlocal
+196 -6
View File
@@ -2,10 +2,200 @@
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
:: support malloc device memory more than 4GB.
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
set LOAD_MODE="--mmap"
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
@echo off
setlocal EnableExtensions EnableDelayedExpansion
REM MIT license
REM Copyright (C) 2024 Intel Corporation
REM SPDX-License-Identifier: MIT
set "BIN_FILE=.\build\bin\llama-completion.exe"
set "SEED=0"
set "GPUS_SETTING="
set "INPUT_PROMPT=Building a website can be done in 10 simple steps:^nStep 1:"
set "MODEL_FILE=..\models\llama-2-7b.Q4_0.gguf"
set "NGL=99"
set "CONTEXT=4096"
set "GGML_SYCL_DEVICE=-1"
set "SPLIT_MODE=layer"
set "LOG_VERBOSE=3"
if "%~1"=="" goto after_args
:parse_args
if "%~1"=="" goto after_args
if /I "%~1"=="-c" (
if "%~2"=="" goto missing_value
set "CONTEXT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--context" (
if "%~2"=="" goto missing_value
set "CONTEXT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-p" (
if "%~2"=="" goto missing_value
set "INPUT_PROMPT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--promote" (
if "%~2"=="" goto missing_value
set "INPUT_PROMPT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-m" (
if "%~2"=="" goto missing_value
set "MODEL_FILE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--model" (
if "%~2"=="" goto missing_value
set "MODEL_FILE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-mg" (
if "%~2"=="" goto missing_value
set "GGML_SYCL_DEVICE=%~2"
set "SPLIT_MODE=none"
shift
shift
goto parse_args
)
if /I "%~1"=="--main-gpu" (
if "%~2"=="" goto missing_value
set "GGML_SYCL_DEVICE=%~2"
set "SPLIT_MODE=none"
shift
shift
goto parse_args
)
if /I "%~1"=="-sm" (
if "%~2"=="" goto missing_value
set "SPLIT_MODE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--split-mode" (
if "%~2"=="" goto missing_value
set "SPLIT_MODE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-ngl" (
if "%~2"=="" goto missing_value
set "NGL=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--n-gpu-layers" (
if "%~2"=="" goto missing_value
set "NGL=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-lv" (
if "%~2"=="" goto missing_value
set "LOG_VERBOSE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--log-verbosity" (
if "%~2"=="" goto missing_value
set "LOG_VERBOSE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-h" goto help
if /I "%~1"=="--help" goto help
echo Invalid option: %~1
exit /b 1
:missing_value
echo Missing value for option: %~1
exit /b 1
:help
echo Usage: %~n0 [OPTIONS]
echo.
echo This script processes files with specified options.
echo.
echo Options:
echo -h, --help Display this help message and exit.
echo -c, --context ^<value^> Set context length. Bigger need more memory.
echo -p, --promote ^<value^> Prompt to start generation with.
echo -m, --model ^<value^> Full model file path.
echo -mg,--main-gpu ^<value^> Set main GPU ID (0 - n) for single GPU mode.
echo -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
echo - none: use one GPU only
echo - layer (default): split layers and KV across GPUs
echo - row: split rows across GPUs
echo -ngl,--n-gpu-layers ^<value^> Max. number of layers to store in VRAM (default: -1)
echo -lv,--log-verbosity ^<value^> Set the verbosity threshold. Messages with a higher verbosity will be
echo ignored. Values:
echo - 0: generic output
echo - 1: error
echo - 2: warning
echo - 3: info
echo - 4: debug
exit /b 0
:after_args
REM In Windows CMD, source is not available; call oneAPI setvars if present.
if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
) else (
echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
)
REM Support malloc device memory more than 4GB.
set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
if not "%GGML_SYCL_DEVICE%"=="-1" (
echo Use %GGML_SYCL_DEVICE% as main GPU
REM Use single GPU only.
set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
) else (
echo Use all Intel GPUs, including iGPU ^& dGPU
set "GPUS_SETTING=-sm %SPLIT_MODE%"
)
echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m %MODEL_FILE% -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
set "ZES_ENABLE_SYSMAN=1"
%BIN_FILE% -m "%MODEL_FILE%" -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
endlocal
+1 -1
View File
@@ -473,7 +473,7 @@ target_link_libraries(ggml-base PRIVATE Threads::Threads)
find_library(MATH_LIBRARY m)
if (MATH_LIBRARY)
if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
target_link_libraries(ggml-base PRIVATE m)
target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY})
endif()
endif()
+50 -16
View File
@@ -1205,40 +1205,57 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
if (split_state.n_segments != 1) {
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(tensor->ne[3] == 1);
size_t offset_data = 0;
std::vector<size_t> simple_offsets(n_bufs, 0);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
GGML_ASSERT(tensor->ne[2] == 1);
const size_t row_stride = tensor->nb[1];
GGML_ASSERT(offset % row_stride == 0);
GGML_ASSERT(size % row_stride == 0);
const int64_t r_start = offset / row_stride;
const int64_t r_count = size / row_stride;
GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
const int64_t blck_size = ggml_blck_size(tensor->type);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
r_count, simple_tensor->nb[1], tensor->nb[1]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*tensor->ne[1] == size);
GGML_ASSERT(offset_data*r_count == size);
return;
}
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
const size_t row_stride = tensor->nb[2];
GGML_ASSERT(offset % row_stride == 0);
GGML_ASSERT(size % row_stride == 0);
const int64_t r_start = offset / row_stride;
const int64_t r_count = size / row_stride;
GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
r_count, simple_tensor->nb[2], tensor->nb[2]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*tensor->ne[2] == size);
GGML_ASSERT(offset_data*r_count == size);
return;
}
@@ -1295,40 +1312,57 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
if (split_state.n_segments != 1) {
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(tensor->ne[3] == 1);
size_t offset_data = 0;
std::vector<size_t> simple_offsets(n_bufs, 0);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
GGML_ASSERT(tensor->ne[2] == 1);
const size_t row_stride = tensor->nb[1];
GGML_ASSERT(offset % row_stride == 0);
GGML_ASSERT(size % row_stride == 0);
const int64_t r_start = offset / row_stride;
const int64_t r_count = size / row_stride;
GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
const int64_t blck_size = ggml_blck_size(tensor->type);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
r_count, simple_tensor->nb[1], tensor->nb[1]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*tensor->ne[1] == size);
GGML_ASSERT(offset_data*r_count == size);
return;
}
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
const size_t row_stride = tensor->nb[2];
GGML_ASSERT(offset % row_stride == 0);
GGML_ASSERT(size % row_stride == 0);
const int64_t r_start = offset / row_stride;
const int64_t r_count = size / row_stride;
GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
r_count, simple_tensor->nb[2], tensor->nb[2]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*tensor->ne[2] == size);
GGML_ASSERT(offset_data*r_count == size);
return;
}
+16 -16
View File
@@ -2005,12 +2005,12 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
const int lda = KB * sizeof(TA);
//const int ldb = KB * sizeof(TB);
static thread_local packed_B_t Tile0[TILE_N * TILE_K];
static thread_local packed_B_t Tile1[TILE_N * TILE_K];
static thread_local int8_t Tile23[TILE_M * TILE_K];
alignas(64) static thread_local packed_B_t Tile0[TILE_N * TILE_K];
alignas(64) static thread_local packed_B_t Tile1[TILE_N * TILE_K];
alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];
static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
alignas(64) static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
alignas(64) static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
// double buffering C to interleave avx512 and amx
int32_t * C_cur = TileC0;
@@ -2187,21 +2187,21 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
const int m1 = std::max(M - TILE_M, 0);
//const int lda = KB * sizeof(TA);
static thread_local int8_t Tile0[TILE_N * TILE_K];
static thread_local int8_t Tile1[TILE_N * TILE_K];
static thread_local int8_t Tile23[TILE_M * TILE_K];
alignas(64) static thread_local int8_t Tile0[TILE_N * TILE_K];
alignas(64) static thread_local int8_t Tile1[TILE_N * TILE_K];
alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];
// mat mul result for each group
static thread_local int32_t Tile4[TILE_M * TILE_N];
static thread_local int32_t Tile5[TILE_M * TILE_N];
static thread_local int32_t Tile6[TILE_M * TILE_N];
static thread_local int32_t Tile7[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Tile4[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Tile5[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Tile6[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Tile7[TILE_M * TILE_N];
// sum of each QK_K block, contains 8 groups, int32
static thread_local int32_t Sumi4[TILE_M * TILE_N];
static thread_local int32_t Sumi5[TILE_M * TILE_N];
static thread_local int32_t Sumi6[TILE_M * TILE_N];
static thread_local int32_t Sumi7[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Sumi4[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Sumi5[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Sumi6[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Sumi7[TILE_M * TILE_N];
const int k_group_size = std::is_same<TB, block_q6_K>::value ? 16 : 32;
for (int i = 0; i < KB; ++i) {
+19 -27
View File
@@ -2300,9 +2300,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
#if defined __AVX2__
const __m256i m4 = _mm256_set1_epi8(0xF);
const __m256i m2 = _mm256_set1_epi8(3);
const __m256i m32s = _mm256_set1_epi8(32);
const __m256i m3 = _mm256_set1_epi8(3);
const __m256i m15 = _mm256_set1_epi8(15);
__m256 acc = _mm256_setzero_ps();
@@ -2314,53 +2313,45 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
const __m256i scales_16 = _mm256_cvtepi8_epi16(scales);
const __m256i q8sclsub = _mm256_slli_epi32(_mm256_madd_epi16(q8sums, scales_16), 5);
__m256i sumi = _mm256_setzero_si256();
int is = 0;
for (int j = 0; j < QK_K/128; ++j) {
const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
is += 4;
const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m3), 4);
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, _mm256_set1_epi8(12)), 2);
const __m256i q4h_2 = _mm256_and_si256(q4bitsH, _mm256_set1_epi8(48));
const __m256i q4h_3 = _mm256_srli_epi16(_mm256_and_si256(q4bitsH, _mm256_set1_epi8(-64)), 2);
const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m15), q4h_0);
const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m15), q4h_1);
const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m15), q4h_2);
const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m15), q4h_3);
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
__m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
__m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
__m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
__m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
__m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
__m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
__m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
__m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
is += 4;
p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
@@ -2372,6 +2363,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumi = _mm256_sub_epi32(sumi, q8sclsub);
acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
}
+6 -13
View File
@@ -1036,12 +1036,12 @@ inline static float ggml_gelu_quick_f32(float x) {
return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}
//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
// const uint16_t * i16 = (const uint16_t *) x;
// for (int i = 0; i < n; ++i) {
// y[i] = ggml_table_gelu_quick_f16[i16[i]];
// }
//}
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
const uint16_t * i16 = (const uint16_t *) x;
for (int i = 0; i < n; ++i) {
y[i] = ggml_table_gelu_quick_f16[i16[i]];
}
}
#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
@@ -1060,13 +1060,6 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
}
#endif
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
float v = GGML_CPU_FP16_TO_FP32(x[i]);
y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
}
}
// Sigmoid Linear Unit (SiLU) function
inline static float ggml_silu_f32(float x) {
return x/(1.0f + expf(-x));
+61 -78
View File
@@ -1,96 +1,79 @@
#include "concat.cuh"
// contiguous kernels
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
template <int dim>
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont(const float * x,
const float * y,
float * dst,
int64_t ne00,
int64_t ne01,
int64_t ne02,
int64_t ne0,
int64_t ne1,
int64_t ne2) {
static_assert(dim >= 0 && dim <= 2, "dim must be in [0, 2]");
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
const int64_t n = ne0 * ne1 * ne2;
if (nidx < ne00) { // src0
int offset_src =
nidx +
blockIdx.y * ne00 +
blockIdx.z * ne00 * gridDim.y;
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
(nidx - ne00) +
blockIdx.y * (ne0 - ne00) +
blockIdx.z * (ne0 - ne00) * gridDim.y;
dst[offset_dst] = y[offset_src];
for (int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (int64_t) blockDim.x * gridDim.x) {
if constexpr (dim == 0) {
const int64_t row = i / ne0;
const int64_t i0 = i - row * ne0;
if (i0 < ne00) {
dst[i] = x[row * ne00 + i0];
} else {
dst[i] = y[row * (ne0 - ne00) + (i0 - ne00)];
}
} else if constexpr (dim == 1) {
const int64_t dst_plane = ne0 * ne1;
const int64_t src0_plane = ne0 * ne01;
const int64_t src1_plane = dst_plane - src0_plane;
const int64_t i2 = i / dst_plane;
const int64_t i01 = i - i2 * dst_plane;
if (i01 < src0_plane) {
dst[i] = x[i2 * src0_plane + i01];
} else {
dst[i] = y[i2 * src1_plane + (i01 - src0_plane)];
}
} else {
const int64_t src0_size = ne0 * ne1 * ne02;
if (i < src0_size) {
dst[i] = x[i];
} else {
dst[i] = y[i - src0_size];
}
}
}
}
static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
static void concat_f32_cuda(const float * x,
const float * y,
float * dst,
int64_t ne00,
int64_t ne01,
int64_t ne02,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int dim,
cudaStream_t stream) {
const int64_t n = ne0 * ne1 * ne2;
const int num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
if (blockIdx.y < (unsigned)ne01) { // src0
int offset_src =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * ne01;
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
nidx +
(blockIdx.y - ne01) * ne0 +
blockIdx.z * ne0 * (gridDim.y - ne01);
dst[offset_dst] = y[offset_src];
}
}
static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
if (blockIdx.z < (unsigned)ne02) { // src0
int offset_src =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
nidx +
blockIdx.y * ne0 +
(blockIdx.z - ne02) * ne0 * gridDim.y;
dst[offset_dst] = y[offset_src];
}
}
static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne1, ne2);
if (dim == 0) {
concat_f32_dim0<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
concat_f32_cont<0>
<<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
return;
}
if (dim == 1) {
concat_f32_dim1<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
concat_f32_cont<1>
<<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
return;
}
concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
concat_f32_cont<2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
}
// non-contiguous kernel (slow)
+138 -139
View File
@@ -3478,10 +3478,10 @@ template <ggml_type type, int mmq_x, bool need_check>
static __global__ void mul_mat_q(
const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const int ncols_max) {
const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
const uint3 channel_ratio, const uint3 nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const uint3 sample_ratio, const uint3 nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const uint3 ntx) {
// Skip unused template specializations for faster compilation:
if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) {
@@ -3495,8 +3495,7 @@ static __global__ void mul_mat_q(
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int mmq_y = get_mmq_y_device();
const int ntx = (ncols_max + mmq_x - 1) / mmq_x; // Number of tiles x
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
const uint32_t nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
// Initialize the ids for writing back data with just the index.
// For regular matrix multiplications this is never changed.
@@ -3517,8 +3516,9 @@ static __global__ void mul_mat_q(
// On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
{
const int wt = blockIdx.z / nchannels_y;
const int zt = blockIdx.z - wt*nchannels_y;
const uint2 tmp2 = fast_div_modulo(blockIdx.z, nchannels_y);
const int wt = tmp2.x;
const int zt = tmp2.y;
const int jt = blockIdx.y;
const int it = blockIdx.x;
@@ -3561,40 +3561,40 @@ static __global__ void mul_mat_q(
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
constexpr bool fixup = false;
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
tile_x_max_i, tile_y_max_j, 0, blocks_per_ne00.z);
return;
}
#endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
constexpr int ITER_K = get_iter_k(type);
const int64_t blocks_per_ne00 = ncols_x / qk;
constexpr int blocks_per_iter = ITER_K / qk;
constexpr int ITER_K = get_iter_k(type);
constexpr int blocks_per_iter = ITER_K / qk;
// kbc == k block continuous, current index in continuous ijk space.
int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
int kbc = int64_t(blockIdx.x) *(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
int kbc_stop = int64_t(blockIdx.x + 1)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter;
kbc -= fastmodulo(kbc, blocks_per_ne00) % blocks_per_iter;
kbc_stop -= fastmodulo(kbc_stop, blocks_per_ne00) % blocks_per_iter;
// kb0 == k index when doing the matrix multiplication for an output tile.
int kb0_start = kbc % blocks_per_ne00;
int kb0_stop = min(blocks_per_ne00, kb0_start + kbc_stop - kbc);
while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) {
int tmp = kbc;
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
const int zt = tmp / (ntx*blocks_per_ne00);
tmp -= zt * (ntx*blocks_per_ne00);
const int jt = tmp / blocks_per_ne00;
int kb0_start = fastmodulo(kbc, blocks_per_ne00);
int kb0_stop = min(blocks_per_ne00.z, uint32_t(kb0_start + kbc_stop - kbc));
while (kbc < kbc_stop && kb0_stop == int(blocks_per_ne00.z)) {
int tmp = fastdiv(kbc, blocks_per_ne00);
uint2 tmp2 = fast_div_modulo(tmp, ntx);
const int jt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nchannels_y);
const int zt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nsamples_y);
const int wt = tmp2.y;
const int it = tmp2.x;
// Defaults for regular matrix multiplication:
int col_low = 0;
@@ -3612,11 +3612,11 @@ static __global__ void mul_mat_q(
offset_dst = 0;
if (jt*mmq_x >= col_diff) {
kbc += blocks_per_ne00;
kbc -= kbc % blocks_per_ne00;
kbc += blocks_per_ne00.z;
kbc -= fastmodulo(kbc, blocks_per_ne00);
kb0_start = 0;
kb0_stop = min(blocks_per_ne00, kbc_stop - kbc);
kb0_stop = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));
continue;
}
@@ -3641,32 +3641,34 @@ static __global__ void mul_mat_q(
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
kbc += blocks_per_ne00;
kbc -= kbc % blocks_per_ne00;
kbc += blocks_per_ne00.z;
kbc -= fastmodulo(kbc, blocks_per_ne00);
kb0_start = 0;
kb0_stop = min(blocks_per_ne00, kbc_stop - kbc);
kb0_stop = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));
}
if (kbc >= kbc_stop) {
return;
}
int tmp = kbc;
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
const int zt = tmp / (ntx*blocks_per_ne00);
tmp -= zt * (ntx*blocks_per_ne00);
const int jt = tmp / blocks_per_ne00;
int tmp = fastdiv(kbc, blocks_per_ne00);
uint2 tmp2 = fast_div_modulo(tmp, ntx);
const int jt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nchannels_y);
const int zt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nsamples_y);
const int wt = tmp2.y;
const int it = tmp2.x;
// Defaults for regular matrix multiplication:
int col_low = 0;
@@ -3708,7 +3710,7 @@ static __global__ void mul_mat_q(
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
@@ -3717,46 +3719,37 @@ static __global__ void mul_mat_q(
}
template <ggml_type type, int mmq_x, bool need_check>
static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
const int32_t * expert_bounds,
float * __restrict__ dst,
const float * __restrict__ tmp_last_tile,
const int ncols_x,
const int nrows_x,
const int ncols_dst,
const size_t stride_col_dst,
const int nchannels_y,
const size_t stride_channel_dst,
const int nsamples_y,
const size_t stride_sample_dst,
const int ncols_max) {
constexpr int mmq_y = get_mmq_y_device();
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int ITER_K = get_iter_k(type);
__launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device()/2, 1)
static __global__ void mul_mat_q_stream_k_fixup(
const int32_t * __restrict__ ids_dst, const int32_t * __restrict__ expert_bounds, float * __restrict__ dst,
float * __restrict__ tmp_last_tile, const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst,
const int stride_col_dst, const uint3 nchannels_y, const int stride_channel_dst, const uint3 nsamples_y,
const int stride_sample_dst, const uint3 ntx) {
constexpr int mmq_y = get_mmq_y_device();
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int ITER_K = get_iter_k(type);
constexpr int blocks_per_iter = ITER_K / qk;
constexpr int blocks_per_iter = ITER_K / qk;
const int64_t blocks_per_ne00 = ncols_x / qk;
constexpr int nwarps = mmq_get_nwarps_device();
constexpr int nwarps = mmq_get_nwarps_device()/2;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f};
float sum[mmq_x / nwarps] = {0.0f};
const int i = blockIdx.y*warp_size + threadIdx.x;
const int ntx = (ncols_max + mmq_x - 1) / mmq_x;
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
const int bidx0 = blockIdx.x;
// kbc == k block continuous, current index in continuous ijk space.
int64_t kbc0 = (int64_t) bidx0 *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
int64_t kbc0_stop = (int64_t)(bidx0 + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
int kbc0 = int64_t(blockIdx.x) *(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
int kbc0_stop = int64_t(blockIdx.x + 1)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
kbc0 -= (kbc0 % blocks_per_ne00) % blocks_per_iter;
kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter;
kbc0 -= fastmodulo(kbc0, blocks_per_ne00) % blocks_per_iter;
kbc0_stop -= fastmodulo(kbc0_stop, blocks_per_ne00) % blocks_per_iter;
const bool did_not_have_any_data = kbc0 == kbc0_stop;
const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0;
const bool did_not_write_last = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0;
const bool wrote_beginning_of_tile = fastmodulo(kbc0, blocks_per_ne00) == 0;
const bool did_not_write_last = fastdiv(kbc0, blocks_per_ne00) == fastdiv(kbc0_stop, blocks_per_ne00) && fastmodulo(kbc0_stop, blocks_per_ne00) != 0;
if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) {
return;
}
@@ -3765,11 +3758,11 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
// Iterate over previous blocks and sum up partial sums written to fixup buffer.
// All CUDA blocks that get here must have a previous block that needs a fixup.
int64_t bidx = bidx0 - 1;
int64_t kbc_stop = kbc0;
int bidx = bidx0 - 1;
int kbc_stop = kbc0;
while(true) {
int64_t kbc = bidx*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
int kbc = int64_t(bidx)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
kbc -= fastmodulo(kbc, blocks_per_ne00) % blocks_per_iter;
if (kbc == kbc_stop) { // Did not have any data.
bidx--;
@@ -3779,20 +3772,16 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
any_fixup = true;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
const int i = i0 + threadIdx.x;
sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
}
sum[j0/nwarps] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
}
// If this block started in a previous tile we are done and don't need to combine additional partial results.
if (kbc % blocks_per_ne00 == 0 || kbc/blocks_per_ne00 < kbc0/blocks_per_ne00) {
if (fastmodulo(kbc, blocks_per_ne00) == 0 || fastdiv(kbc, blocks_per_ne00) < fastdiv(kbc0, blocks_per_ne00)) {
break;
}
bidx--;
@@ -3803,14 +3792,16 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
return;
}
int tmp = kbc0;
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
const int zt = tmp / (ntx*blocks_per_ne00);
tmp -= zt * (ntx*blocks_per_ne00);
const int jt = tmp / blocks_per_ne00;
int tmp = fastdiv(kbc0, blocks_per_ne00);
uint2 tmp2 = fast_div_modulo(tmp, ntx);
const int jt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nchannels_y);
const int zt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nsamples_y);
const int wt = tmp2.y;
const int it = tmp2.x;
if (!ids_dst) {
const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
@@ -3818,6 +3809,9 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
const int i_max = nrows_x - it*mmq_y - 1;
const int j_max = ncols_dst - jt*mmq_x - 1;
if (need_check && i > i_max) {
return;
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -3827,16 +3821,7 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
return;
}
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
const int i = i0 + threadIdx.x;
if (need_check && i > i_max) {
continue;
}
dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size];
}
dst[j*stride_col_dst + i] += sum[j0/nwarps];
}
return;
}
@@ -3856,6 +3841,9 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
const int i_max = nrows_x - it*mmq_y - 1;
const int j_max = col_diff - jt*mmq_x - 1;
if (need_check && i > i_max) {
return;
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -3865,16 +3853,7 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
return;
}
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
const int i = i0 + threadIdx.x;
if (need_check && i > i_max) {
continue;
}
dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size];
}
dst[ids_dst_shared[j]*stride_col_dst + i] += sum[j0/nwarps];
}
}
@@ -3922,29 +3901,44 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int channel_ratio = args.nchannels_y / args.nchannels_x;
const int sample_ratio = args.nsamples_y / args.nsamples_x;
const uint3 blocks_per_ne00_fd = init_fastdiv_values(args.ncols_x / ggml_cuda_type_traits<type>::qk);
const uint3 ntx_fd = init_fastdiv_values(ntx);
const uint3 nchannels_y_fd = init_fastdiv_values(args.nchannels_y);
const uint3 nsamples_y_fd = init_fastdiv_values(args.nsamples_y);
const uint3 channel_ratio_fd = init_fastdiv_values(channel_ratio);
const uint3 sample_ratio_fd = init_fastdiv_values(sample_ratio);
if (!args.use_stream_k) {
if (args.nrows_x % mmq_y == 0) {
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
args.ncols_max);
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
ntx_fd);
} else {
constexpr bool need_check = true;
mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
args.ncols_max);
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
ntx_fd);
}
return;
}
const dim3 block_nums_stream_k(nsm, 1, 1);
const bool fixup_needed = ntx*nty*ntzw % nsm != 0;
// For the stream-k kernel it is possible to run it with tiling by setting the number of CUDA blocks equal to the number of tiles.
// This is worthwhile if the efficiency of tiling is high and skipping the fixup kernel is more important.
const int ntiles_dst = ntx * nty * ntzw;
const int tiles_nwaves = (ntiles_dst + nsm - 1) / nsm;
const int tiles_efficiency_percent = 100 * ntiles_dst / (nsm*tiles_nwaves);
const dim3 block_nums_stream_k(GGML_CUDA_CC_IS_NVIDIA(cc) && tiles_efficiency_percent >= 90 ? ntiles_dst : nsm, 1, 1);
GGML_ASSERT(ntiles_dst * blocks_per_ne00_fd.z < (1 << 30)); // Assert that variable kbc will not overflow.
const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0;
ggml_cuda_pool & pool = ctx.pool(id);
ggml_cuda_pool_alloc<float> tmp_fixup(pool);
@@ -3952,40 +3946,45 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y);
}
const dim3 block_nums_fixup(block_nums_stream_k.x, mmq_y/warp_size, 1);
const dim3 block_dims_fixup(block_dims.x, block_dims.y/2, block_dims.z);
if (args.nrows_x % mmq_y == 0) {
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
args.ncols_max);
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
ntx_fd);
if (!fixup_needed) {
return;
}
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst,
args.ncols_max);
CUDA_CHECK(cudaGetLastError());
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
ntx_fd);
} else {
constexpr bool need_check = true;
mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
args.ncols_max);
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
ntx_fd);
if (!fixup_needed) {
return;
}
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst,
args.ncols_max);
CUDA_CHECK(cudaGetLastError());
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
ntx_fd);
}
}
+291 -150
View File
@@ -12,9 +12,12 @@
#include <cstddef>
#include <stdexcept>
#include <string>
#include <sstream>
#include <iomanip>
#include <unordered_set>
#include <unordered_map>
#include <regex>
#include <queue>
#ifdef _WIN32
# include <sal.h>
@@ -41,18 +44,26 @@
#include "htp_iface.h"
#include "htp-drv.h"
using intvec = std::vector<int>;
using uintvec = std::vector<unsigned int>;
using u32vec = std::vector<uint32_t>;
static size_t opt_ndev = 1;
static size_t opt_nhvx = 0; // use all
static int opt_arch = 0; // autodetect
static int opt_etm = 0;
static int opt_verbose = 0;
static int opt_profile = 0;
static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
static int opt_hostbuf = 1; // hostbuf ON by default
static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
// Default PMU events, if profiling with PMU (mode=2) is enabled
// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
// https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
// Enable all stages by default
static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_COMPUTE;
static int opt_opsync = 0; // synchronous ops
static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
static int opt_opbatch = 1024; // max number of ops in a batch
static int opt_opqueue = 16; // max number of pending batches
static std::regex* opt_opfilter = NULL; // regex of ops to not claim
@@ -104,19 +115,26 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct
}
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
if (!opt_profile) return;
op_desc desc(op);
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
char pmu_str[256] = "";
if (opt_profile > 1) {
static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
}
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
}
// ** backend sessions
struct ggml_hexagon_opbatch;
struct ggml_hexagon_opshm;
struct ggml_hexagon_opqueue;
struct ggml_hexagon_session {
std::string name;
@@ -132,8 +150,8 @@ struct ggml_hexagon_session {
bool valid_iface;
std::atomic<int> op_pending;
ggml_hexagon_opbatch *op_batch;
ggml_hexagon_opshm *op_shm;
ggml_hexagon_opbatch* op_batch;
ggml_hexagon_opqueue* op_queue;
ggml_backend_buffer_type buffer_type = {};
ggml_backend_buffer_type repack_buffer_type = {};
@@ -1521,65 +1539,14 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
// Backend session implementation
struct ggml_hexagon_opshm {
ggml_hexagon_shared_buffer *sbuf;
std::vector<bool> block_mask;
size_t block_size;
uint8_t * base() const { return this->sbuf->base; }
int fd() const { return this->sbuf->fd; }
size_t n_blocks() const { return this->block_mask.size(); }
ggml_hexagon_opshm(ggml_hexagon_session *sess, size_t max_batch, size_t max_pending) {
size_t n_bufs = HTP_OP_MAX_BUFS;
size_t n_ops = max_batch;
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
block_mask.resize(max_pending, true);
block_size = sizeof(htp_buf_desc) * n_bufs +
sizeof(htp_tensor) * n_tensors +
sizeof(htp_op_desc) * n_ops;
sbuf = new ggml_hexagon_shared_buffer(sess, block_size * block_mask.size(), true /* pinned */);
if (opt_verbose) {
GGML_LOG_INFO("ggml-hex: %s allocated shared buf %zu : block-size %zu max-batch %zu max-pending %zu\n",
sess->c_name(), (size_t) sbuf->size, block_size, max_batch, max_pending);
}
}
~ggml_hexagon_opshm() {
delete sbuf;
}
uint8_t * allocate() {
auto it = std::find(block_mask.begin(), block_mask.end(), true);
if (it == block_mask.end())
return nullptr;
unsigned int i = std::distance(block_mask.begin(), it);
uint8_t* addr = sbuf->base + (i * block_size);
block_mask[i] = false;
HEX_VERBOSE("ggml-hex: %s allocated op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
return addr;
}
void release(uint8_t * addr) {
int i = (addr - sbuf->base) / block_size;
block_mask[i] = true;
HEX_VERBOSE("ggml-hex: %s released op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
}
};
struct ggml_hexagon_opbatch {
const char* name;
ggml_hexagon_session* sess;
std::vector<htp_buf_desc> buffers;
std::vector<htp_tensor> tensors;
std::vector<htp_op_desc> ops;
std::vector<const ggml_tensor*> ops; // pointers to original ops
std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
std::vector<htp_tensor> h_tens; // htp tensor descriptors
std::vector<htp_op_desc> h_ops; // htp op descriptors
std::unordered_map<int, int> b_map; // buffer fd to index
std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr to index
@@ -1606,19 +1573,21 @@ struct ggml_hexagon_opbatch {
d_map.clear();
}
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t max_batch) {
name = sess->c_name();
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size) {
this->sess = sess;
n_bufs_max = HTP_OP_MAX_BUFS;
n_ops_max = max_batch;
n_ops_max = batch_size;
n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;
b_vmem_max = HTP_OP_MAX_VMEM;
buffers.resize(n_bufs_max);
tensors.resize(n_tens_max);
ops.resize(n_ops_max);
h_bufs.resize(n_bufs_max);
h_tens.resize(n_tens_max);
h_ops.resize(n_ops_max);
b_map.reserve(n_bufs_max);
t_map.reserve(n_tens_max);
d_map.reserve(n_tens_max);
@@ -1640,7 +1609,7 @@ struct ggml_hexagon_opbatch {
b_map.insert({sbuf->fd, bi});
htp_buf_desc &b = buffers[bi];
htp_buf_desc &b = h_bufs[bi];
b.base = (uint64_t) sbuf->base;
b.fd = sbuf->fd;
b.size = sbuf->size;
@@ -1664,7 +1633,7 @@ struct ggml_hexagon_opbatch {
// First lookup by tensor data
auto range = d_map.equal_range(t->data);
for (auto it = range.first; it != range.second; ++it) {
htp_tensor * h = &tensors[it->second];
htp_tensor * h = &h_tens[it->second];
if (same_shape(h, t)) { return it->second; }
}
@@ -1682,7 +1651,7 @@ struct ggml_hexagon_opbatch {
uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
size_t t_size = ggml_nbytes(t);
htp_tensor &h = tensors[ti];
htp_tensor &h = h_tens[ti];
h.bi = add_buffer(sbuf);
h.data = t_offset;
h.size = t_size;
@@ -1737,65 +1706,170 @@ struct ggml_hexagon_opbatch {
// assumes that fit_op() was called first and returned true
void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
// Add new op
htp_op_desc &o = ops[n_ops++];
unsigned int n = n_ops++;
GGML_ASSERT(n_ops <= n_ops_max);
ops[n] = t;
htp_op_desc &o = h_ops[n];
memcpy(&o.params, &t->op_params, sizeof(t->op_params));
o.opcode = opcode;
o.flags = 0;
if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
}
ggml_hexagon_dump_op_exec(name, t, o.flags);
ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);
for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
}
o.dst = add_tensor(t);
}
};
size_t flush(uint8_t * mem_addr, size_t mem_size) {
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
struct ggml_hexagon_opqueue {
// Shared buffer for storing batches
ggml_hexagon_shared_buffer *shm_buf;
size_t shm_blk_size;
const size_t b_size = sizeof(htp_buf_desc) * n_bufs;
const size_t t_size = sizeof(htp_tensor) * n_tens;
const size_t o_size = sizeof(htp_op_desc) * n_ops;
using opvec = std::vector<const ggml_tensor*>;
const size_t m_size = b_size + t_size + o_size;
GGML_ASSERT(m_size <= mem_size);
std::queue<unsigned int> done; // completed batch ids
std::vector<opvec> op_cache; // per batch op cache
std::vector<uint64_t> start_usec; // per batch start time
uint8_t * b_ptr = (uint8_t *) mem_addr;
uint8_t * t_ptr = (uint8_t *) b_ptr + b_size;
uint8_t * o_ptr = (uint8_t *) t_ptr + t_size;
ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
size_t n_bufs = HTP_OP_MAX_BUFS;
size_t n_ops = batch_size;
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
memcpy(b_ptr, (void *) buffers.data(), b_size);
memcpy(t_ptr, (void *) tensors.data(), t_size);
memcpy(o_ptr, (void *) ops.data(), o_size);
shm_blk_size = sizeof(htp_buf_desc) * n_bufs +
sizeof(htp_tensor) * n_tensors +
sizeof(htp_op_desc) * n_ops +
sizeof(htp_prof_desc) * n_ops;
HEX_VERBOSE("ggml-hex: %s flush-opbatch : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu\n",
name, n_bufs, n_tens, n_ops, b_vmem, b_size, t_size, o_size);
shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
op_cache.resize(depth);
start_usec.resize(depth, 0);
// init done queue
for (unsigned int i = 0; i < depth; i++) { done.push(i); }
if (opt_verbose) {
GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
}
}
~ggml_hexagon_opqueue() {
delete shm_buf;
}
// push new batch
bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
if (done.empty()) { return false; }
req.id = done.front(); done.pop(); // batch id
req.n_bufs = op_batch->n_bufs;
req.n_tensors = op_batch->n_tens;
req.n_ops = op_batch->n_ops;
op_cache[req.id] = op_batch->ops;
start_usec[req.id] = ggml_time_us();
const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs;
const size_t t_size = sizeof(htp_tensor) * req.n_tensors;
const size_t o_size = sizeof(htp_op_desc) * req.n_ops;
const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
dbuf.ptr = shm_buf->base + (req.id * shm_blk_size);
dbuf.fd = shm_buf->fd;
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
dbuf.size = b_size + t_size + o_size + p_size;
GGML_ASSERT(dbuf.size <= shm_blk_size);
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
uint8_t * b_ptr = m_ptr; m_ptr += b_size;
uint8_t * t_ptr = m_ptr; m_ptr += t_size;
uint8_t * o_ptr = m_ptr;
memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size);
HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
b_size, t_size, o_size, (size_t) dbuf.size);
op_batch->reset();
if (opt_verbose > 1) {
htp_buf_desc *b = (htp_buf_desc*) b_ptr;
for (unsigned int i=0; i < n_bufs; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", name, i,
for (unsigned int i=0; i < req.n_bufs; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
b[i].fd, (void *) b[i].base, (size_t) b[i].size);
}
htp_tensor *t = (htp_tensor*) t_ptr;
for (unsigned int i=0; i < n_tens; i++) {
for (unsigned int i=0; i < req.n_tensors; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
name, i, t[i].bi, t[i].data, t[i].size,
shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
(size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
}
}
reset();
return true;
}
return m_size;
void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
GGML_ASSERT(rsp.id < op_cache.size());
done.push(rsp.id);
const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs;
const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors;
const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops;
const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
const size_t m_size = b_size + t_size + o_size + p_size;
GGML_ASSERT(m_size <= shm_blk_size);
HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
(size_t) dbuf.size, b_size, t_size, o_size);
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
if (opt_profile && rsp.n_ops > 0) {
auto & ops = op_cache[rsp.id];
uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
uint32_t htp_usec = 0;
GGML_ASSERT(rsp.n_ops <= ops.size());
const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
for (uint32_t i = 0; i < rsp.n_ops; i++) {
htp_usec += pd[i].usecs;
ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
}
GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
}
}
};
@@ -1824,17 +1898,12 @@ void ggml_hexagon_session::flush_pending(bool all) {
GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
}
op_shm->release((uint8_t*) dbuf.ptr);
if (rsp.status != HTP_STATUS_OK) {
GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
// TODO: handle errors
}
// FIXME: profile will be per opreq
// this->prof_usecs = rsp.prof_usecs;
// this->prof_cycles = rsp.prof_cycles;
// this->prof_pkts = rsp.prof_pkts;
op_queue->pop(rsp, dbuf);
this->op_pending--; // atomic dec
@@ -1845,28 +1914,17 @@ void ggml_hexagon_session::flush_pending(bool all) {
void ggml_hexagon_session::flush_batch() {
if (op_batch->empty()) { return; }
htp_opbatch_req req;
req.n_bufs = op_batch->n_bufs;
req.n_tensors = op_batch->n_tens;
req.n_ops = op_batch->n_ops;
htp_opbatch_req req {};
dspqueue_buffer dbuf{};
dspqueue_buffer dbuf;
dbuf.fd = op_shm->fd();
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
dbuf.ptr = op_shm->allocate();
if (!dbuf.ptr) {
if (!op_queue->push(req, dbuf, op_batch)) {
flush_pending(false);
dbuf.ptr = op_shm->allocate();
op_queue->push(req, dbuf, op_batch);
}
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) op_shm->base();
dbuf.size = op_batch->flush((uint8_t*) dbuf.ptr, op_shm->block_size);
// Bump pending flag (cleared in the session::flush once we get the response)
this->op_pending++; // atomic inc
HEX_VERBOSE("ggml-hex: %s: queue-opbatch : %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
if (err != 0) {
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
@@ -2016,25 +2074,33 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
}
if (opt_etm) {
err = htp_iface_enable_etm(this->handle);
err = htp_iface_etm(this->handle, 1);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
}
}
// Start the DSP-side service. We need to pass the queue ID to the
// DSP in a FastRPC call; the DSP side will import the queue and start
// listening for packets in a callback.
if (opt_profile) {
htp_iface_pmu_conf pmu_conf{};
std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
}
}
// Allocate buffers and state for op batching
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
// Start processing op batch requests
err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
}
this->valid_iface = true;
// Allocate buffers and state for op batching
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
this->op_shm = new ggml_hexagon_opshm(this, opt_opbatch, opt_opqueue);
}
void ggml_hexagon_session::release() noexcept(true) {
@@ -2043,7 +2109,7 @@ void ggml_hexagon_session::release() noexcept(true) {
int err;
delete this->op_batch;
delete this->op_shm;
delete this->op_queue;
// Stop the DSP-side service and close the queue
if (this->valid_iface) {
@@ -2054,12 +2120,20 @@ void ggml_hexagon_session::release() noexcept(true) {
}
if (opt_etm) {
err = htp_iface_disable_etm(this->handle);
err = htp_iface_etm(this->handle, 0);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
}
}
if (opt_profile) {
htp_iface_pmu_conf pmu_conf{};
err = htp_iface_profiler(this->handle, 0, &pmu_conf);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
}
}
if (this->valid_queue) {
err = dspqueue_close(queue);
if (err != 0) {
@@ -2077,7 +2151,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
repack_buffer_type.device = dev;
op_batch = nullptr;
op_shm = nullptr;
op_queue = nullptr;
try {
allocate(dev_id);
@@ -2619,6 +2693,39 @@ static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess
return true;
}
static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
const struct ggml_tensor * src0 = op->src[0]; // A
const struct ggml_tensor * src1 = op->src[1]; // B
const struct ggml_tensor * dst = op; // X
if (!src0 || !src1) {
return false;
}
if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
return false;
}
if (src0->ne[0] != src0->ne[1]) {
return false;
}
if (src0->ne[1] != src1->ne[1]) {
return false;
}
if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
return false;
}
if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
return false;
}
GGML_UNUSED(sess);
return true;
}
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
return sess->c_name();
@@ -2657,7 +2764,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
case GGML_OP_FILL: return HTP_OP_FILL;
case GGML_OP_DIAG: return HTP_OP_DIAG;
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(t)) {
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
@@ -2698,7 +2805,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * n = graph->nodes[i];
if (op_is_compute(n)) {
if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
sess->enqueue_op(op_remap_to_htp(n), n);
}
}
@@ -3203,6 +3310,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
supp = ggml_hexagon_supported_diag(sess, op);
break;
case GGML_OP_SOLVE_TRI:
supp = ggml_hexagon_supported_solve_tri(sess, op);
break;
default:
break;
}
@@ -3338,6 +3449,26 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
return NULL;
}
template<typename T> std::vector<T> str_to_vec(const char* str) {
std::stringstream ss(str);
std::vector<T> v;
std::string t;
while (std::getline(ss, t, ',')) {
v.push_back(std::stoul(t, nullptr, 0));
}
return v;
}
template<typename T, int BASE=10> std::string vec_to_str(std::vector<T> v) {
std::stringstream ss;
ss << std::setbase(BASE) << std::showbase;
for (auto i : v) { ss << i << ','; }
auto str = ss.str(); str.pop_back(); // drop last comma
return str;
}
static void ggml_hexagon_init(ggml_backend_reg * reg) {
// Basic sanity checks to make sure definitions match
static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
@@ -3351,8 +3482,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC");
const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
const char * str_opfilter= getenv("GGML_HEXAGON_OPFILTER");
@@ -3365,19 +3495,30 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
auto RE_ICASE = std::regex_constants::icase;
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask;
opt_opsync = str_opsync ? atoi(str_opsync) : opt_opsync;
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
opt_profile = str_profile ? atoi(str_profile) : 0;
opt_etm = str_etm ? atoi(str_etm) : 0;
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage;
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
opt_etm = str_etm ? atoi(str_etm) : 0;
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
if (str_profile) {
opt_pmu_evt = [&]() -> std::vector<uint32_t> {
auto v = str_to_vec<uint32_t>(str_profile);
switch (v.size()) {
case 1: opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
case 8: opt_profile = 2; return v; // mode with custom pmu events
default: opt_profile = 0; return {}; // garbage input
}}();
if (opt_profile == 1) opt_pmu_evt = {};
GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
}
if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
+1
View File
@@ -36,6 +36,7 @@ add_library(${HTP_LIB} SHARED
cumsum-ops.c
fill-ops.c
diag-ops.c
solve-tri-ops.c
)
target_compile_definitions(${HTP_LIB} PRIVATE
+28
View File
@@ -4,6 +4,7 @@
#include <stdbool.h>
#include <stdint.h>
#include <qurt_memory.h>
#include <qurt.h>
#include "hexagon_types.h"
#include "hexagon_protos.h"
@@ -100,4 +101,31 @@ static inline void hex_pause() {
asm volatile(" pause(#255)\n");
}
#ifndef HEX_NUM_PMU_COUNTERS
#define HEX_NUM_PMU_COUNTERS 8
#endif
static inline void hex_get_pmu(uint32_t counters[]) {
#if __HVX_ARCH__ >= 79
asm volatile("%0 = upmucnt0" : "=r"(counters[0]));
asm volatile("%0 = upmucnt1" : "=r"(counters[1]));
asm volatile("%0 = upmucnt2" : "=r"(counters[2]));
asm volatile("%0 = upmucnt3" : "=r"(counters[3]));
asm volatile("%0 = upmucnt4" : "=r"(counters[4]));
asm volatile("%0 = upmucnt5" : "=r"(counters[5]));
asm volatile("%0 = upmucnt6" : "=r"(counters[6]));
asm volatile("%0 = upmucnt7" : "=r"(counters[7]));
#else
counters[0] = qurt_pmu_get(QURT_PMUCNT0);
counters[1] = qurt_pmu_get(QURT_PMUCNT1);
counters[2] = qurt_pmu_get(QURT_PMUCNT2);
counters[3] = qurt_pmu_get(QURT_PMUCNT3);
counters[4] = qurt_pmu_get(QURT_PMUCNT4);
counters[5] = qurt_pmu_get(QURT_PMUCNT5);
counters[6] = qurt_pmu_get(QURT_PMUCNT6);
counters[7] = qurt_pmu_get(QURT_PMUCNT7);
// qurt_pmu_get_pmucnt(counters);
#endif
}
#endif /* HEX_UTILS_H */
+1 -1
View File
@@ -1683,7 +1683,7 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
__fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget);
FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", __func__, m, k, n, weight_type,
FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", m, k, n, weight_type,
M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);
// initialize eye tile (32x32 identity matrix)
+5 -1
View File
@@ -10,6 +10,7 @@
#include <dspqueue.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdbool.h>
#define HTP_MAX_NTHREADS 10
#define HTP_MAX_MMAPS 16
@@ -66,7 +67,9 @@ struct htp_context {
int thread_id;
int thread_prio;
int hmx_enabled;
bool hmx_enabled;
bool etm;
uint32_t profiler;
uint8_t * vtcm_base;
size_t vtcm_size;
@@ -100,5 +103,6 @@ int op_ssm_conv(struct htp_ops_context * octx);
int op_cumsum(struct htp_ops_context * octx);
int op_fill(struct htp_ops_context * octx);
int op_diag(struct htp_ops_context * octx);
int op_solve_tri(struct htp_ops_context * octx);
#endif /* HTP_CTX_H */
+28 -10
View File
@@ -42,9 +42,9 @@ enum htp_data_type {
// Mask to enable various stages of the Ops.
// Used for debugging and profiling.
enum htp_op_mask {
HTP_OPMASK_QUEUE = (1 << 0), // Enable Queueing (ie calls into the DSP)
HTP_OPMASK_COMPUTE = (1 << 1), // Enable Compute
enum htp_op_stage {
HTP_OPSTAGE_QUEUE = (1 << 0), // Enable Queueing (ie calls into NPU)
HTP_OPSTAGE_COMPUTE = (1 << 1), // Enable Compute
};
// Do not reorder first 4 (used as an index)
@@ -82,7 +82,7 @@ enum htp_op_code {
HTP_OP_CUMSUM,
HTP_OP_FILL,
HTP_OP_DIAG,
HTP_OP_SOLVE_TRI,
HTP_OP_INVALID
};
@@ -137,27 +137,45 @@ struct htp_op_desc {
int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices
uint16_t dst; // Output tensor index
};
// the rest is filled in-place by the NPU
uint32_t prof_usecs; // Number of usec per request
uint32_t prof_cycles; // Number of cycles per request
uint32_t prof_pkts; // Number of instruction packets per request
uint32_t unused;
enum htp_profiler_mode {
HTP_PROF_DISABLED = 0,
HTP_PROF_BASIC = 1,
HTP_PROF_PMU = 2,
};
#define HTP_PROF_PMU_NCNT 8
// Profile descriptor
struct htp_prof_desc {
uint32_t opcode; // GGML/HTP Op
uint32_t usecs; // Number of usec
uint32_t cycles; // Number of cycles
uint32_t pad; // Unused
uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters
};
struct htp_opbatch_req {
uint32_t id; // Batch id
uint32_t n_bufs; // Number of buffers
uint32_t n_tensors; // Number of tensors
uint32_t n_ops; // Number of ops
uint32_t flags; // unused
uint32_t pad; // unused
// struct htp_buf_desc bufs[]; -- dspqueue buf 0
// struct htp_tensor tensors[]; -- dspqueue buf 0
// struct htp_op_desc ops[]; -- dspqueue buf 0
};
struct htp_opbatch_rsp {
uint32_t id; // Batch id
uint32_t status; // HTP_STATUS_...
// struct htp_op_req ops[]; -- dspqueue buf 0
uint32_t n_bufs; // Number of buffers
uint32_t n_tensors; // Number of tensors
uint32_t n_ops; // Number of op profile descriptors
uint32_t pad; // unused
// struct htp_prof_desc profs[]; -- dspqueue buf 0
};
#endif /* HTP_OPS_H */
+6 -2
View File
@@ -6,13 +6,17 @@
#include "AEEStdDef.idl"
#include "remote.idl"
struct htp_iface_pmu_conf {
uint32 events[8];
};
interface htp_iface : remote_handle64 {
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx);
AEEResult stop();
AEEResult mmap(in uint32 fd, in uint32 size, in uint32 pinned);
AEEResult munmap(in uint32 fd);
AEEResult enable_etm();
AEEResult disable_etm();
AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
AEEResult etm(in uint32 enable);
};
#endif /* HTP_IDL */
+24
View File
@@ -256,6 +256,18 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b)
return Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b));
}
static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b));
}
static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b));
}
static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b));
}
#else
static inline HVX_Vector hvx_vec_add_f16_f16(HVX_Vector a, HVX_Vector b)
@@ -273,6 +285,18 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b)
return Q6_Vhf_vmpy_VhfVhf(a, b);
}
static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_vadd_VsfVsf(a, b);
}
static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_vsub_VsfVsf(a, b);
}
static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_vmpy_VsfVsf(a, b);
}
#endif // __HVX_ARCH__ < 79
#endif /* HVX_BASE_H */
+145 -54
View File
@@ -27,6 +27,7 @@
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-ops.h"
#include "htp_iface.h"
#include "worker-pool.h"
AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
@@ -100,6 +101,74 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
}
}
#if __HVX_ARCH__ >= 75
{
// Set HMX clock
HAP_power_request_t request;
memset(&request, 0, sizeof(HAP_power_request_t));
request.type = HAP_power_set_HMX_v2;
request.hmx_v2.set_clock = TRUE;
request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX;
request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX;
request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH;
FARF(ALWAYS, "Setting HMX clock\n");
err = HAP_power_set((void *) &ctx, &request);
if (err != AEE_SUCCESS) {
FARF(ERROR, "Error setting HMX clock.");
return err;
}
}
#endif
return AEE_SUCCESS;
}
AEEResult htp_iface_etm(remote_handle64 handle, uint32_t enable) {
int err = enable ? HAP_user_etm_enable() : HAP_user_etm_disable();
if (err) {
if (err == AEE_EVERSIONNOTSUPPORT) {
FARF(ERROR, "API HAP_user_etm_enable/disable is not supported\n");
} else {
FARF(ERROR, "Error executing HAP_user_etm_enable/disable with error code : 0x%x\n", err);
}
}
return err;
}
AEEResult htp_iface_profiler(remote_handle64 handle, uint32_t mode, const htp_iface_pmu_conf* pmu_conf) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
}
if (mode == HTP_PROF_PMU) {
const uint32_t* events = pmu_conf->events;
// Pack 4 event IDs (low 8 bits) into each 32-bit config register
uint32_t evtcfg = 0, evtcfg1 = 0, cfg = 0, i = 0;
for (; i < HEX_NUM_PMU_COUNTERS/2; i++) {
evtcfg |= ((events[i + 0] & 0xFF) << (i * 8));
evtcfg1 |= ((events[i + 4] & 0xFF) << (i * 8));
}
// For events >255 pack high 2 bits of all 8 event IDs into cfg register
// 2 bits per counter: bits [1:0] for counter 0, [3:2] for counter 1, etc.
for (i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
cfg |= (((events[i] >> 8) & 3) << (i * 2));
}
FARF(ALWAYS, "Configuring PMU registers: evtcfg = 0x%x, evtcfg1 = 0x%x, pmucfg = 0x%x", evtcfg, evtcfg1, cfg);
// Configure PMU registers
qurt_pmu_set(QURT_PMUCFG, cfg);
qurt_pmu_set(QURT_PMUEVTCFG, evtcfg);
qurt_pmu_set(QURT_PMUEVTCFG1, evtcfg1);
qurt_pmu_enable(1);
}
ctx->profiler = mode;
return AEE_SUCCESS;
}
@@ -129,35 +198,19 @@ AEEResult htp_iface_close(remote_handle64 handle) {
}
}
if (ctx->profiler) {
qurt_pmu_enable(1);
}
if (ctx->etm) {
HAP_user_etm_disable();
}
free(ctx);
return AEE_SUCCESS;
}
AEEResult htp_iface_enable_etm(remote_handle64 handle) {
int err = HAP_user_etm_enable();
if (err) {
if (err == AEE_EVERSIONNOTSUPPORT) {
FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
} else {
FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
}
}
return err;
}
AEEResult htp_iface_disable_etm(remote_handle64 handle) {
int err = HAP_user_etm_disable();
if (err) {
if (err == AEE_EVERSIONNOTSUPPORT) {
FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
} else {
FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
}
}
return err;
}
AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t pinned) {
AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32 pinned) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
@@ -204,7 +257,7 @@ AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t
return AEE_ENOMEMORY;
}
AEEResult htp_iface_munmap(remote_handle64 handle, int fd) {
AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
@@ -434,19 +487,39 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
struct profile_data {
uint64_t usecs;
uint64_t cycles;
uint64_t pkts;
uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
};
static inline void profile_start(struct profile_data * d) {
d->usecs = HAP_perf_get_qtimer_count();
d->cycles = hex_get_cycles();
d->pkts = hex_get_pktcnt();
static inline void profile_start(uint32_t mode, struct profile_data * d) {
switch (mode) {
case HTP_PROF_PMU:
hex_get_pmu(d->pmu_counters);
// fallthrough
case HTP_PROF_BASIC:
d->usecs = HAP_perf_get_qtimer_count();
d->cycles = hex_get_cycles();
break;
default:
break;
}
}
static inline void profile_stop(struct profile_data * d) {
d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
d->cycles = hex_get_cycles() - d->cycles;
d->pkts = hex_get_pktcnt() - d->pkts;
static inline void profile_stop(uint32_t mode, struct profile_data * d) {
uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
switch (mode) {
case HTP_PROF_PMU:
hex_get_pmu(pmu_counters);
for (int i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
d->pmu_counters[i] = pmu_counters[i] - d->pmu_counters[i];
}
// fallthrough
case HTP_PROF_BASIC:
d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
d->cycles = hex_get_cycles() - d->cycles;
break;
default:
break;
}
}
static int execute_op(struct htp_ops_context * octx) {
@@ -520,6 +593,9 @@ static int execute_op(struct htp_ops_context * octx) {
case HTP_OP_DIAG:
return op_diag(octx);
case HTP_OP_SOLVE_TRI:
return op_solve_tri(octx);
case HTP_OP_INVALID:
break;
@@ -726,30 +802,33 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
continue;
}
// Reset poll count for valid requests
poll_count = DSPQUEUE_POLL_COUNT;
const uint32_t n_bufs = req.n_bufs;
const uint32_t n_tens = req.n_tensors;
const uint32_t n_ops = req.n_ops;
const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops;
if (dbuf.size < b_size + t_size + o_size) {
if (dbuf.size < b_size + t_size + o_size + p_size) {
FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
break;
}
// Reset poll count for valid requests
poll_count = DSPQUEUE_POLL_COUNT;
uint8_t * m_ptr = dbuf.ptr;
struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr;
FARF(HIGH, "processing opbatch: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u",
FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id,
n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);
// Setup descriptor pointers
uint8_t * m_ptr = dbuf.ptr;
struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; m_ptr += o_size;
struct htp_prof_desc* pds = (struct htp_prof_desc*) m_ptr;
prep_op_bufs(ctx, bufs, n_bufs);
prep_tensors(ctx, bufs, tens, n_tens);
@@ -760,22 +839,34 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
for (uint32_t i=0; i < n_ops; i++) {
struct profile_data prof;
profile_start(&prof);
profile_start(ctx->profiler, &prof);
proc_op_req(octx, tens, i, &ops[i]);
profile_stop(&prof);
ops[i].prof_usecs = prof.usecs;
ops[i].prof_cycles = prof.cycles;
ops[i].prof_pkts = prof.pkts;
profile_stop(ctx->profiler, &prof);
if (ctx->profiler) {
pds[i].opcode = ops[i].opcode;
pds[i].usecs = prof.usecs;
pds[i].cycles = prof.cycles;
for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) {
pds[i].pmu[j] = prof.pmu_counters[j];
}
}
}
// dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
struct htp_opbatch_rsp rsp;
rsp.status = HTP_STATUS_OK; // FIXME
rsp.id = req.id;
rsp.status = HTP_STATUS_OK;
rsp.n_bufs = n_bufs;
rsp.n_tensors = n_tens;
rsp.n_ops = n_ops;
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE);
if (err != 0) {
FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
+4
View File
@@ -3017,6 +3017,10 @@ int op_matmul(struct htp_ops_context * octx) {
const int act_stride = (int)(src1->nb[1] / sizeof(float));
const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16));
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
return HTP_STATUS_OK;
}
if (src0->type == HTP_TYPE_F16) {
if (is_batched) {
hmx_matmul_w16a32_batched_params_t batch_params = {
+267
View File
@@ -0,0 +1,267 @@
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#include <HAP_farf.h>
#include <HAP_perf.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "hvx-types.h"
#include "hvx-utils.h"
struct htp_solve_tri_context {
struct htp_ops_context * octx;
uint32_t jobs_per_thread;
uint32_t total_jobs;
uint32_t k_chunks;
uint32_t col_block;
};
static inline void solve_tri_row_scalar(const float * A_row,
const float * B_row,
float * X,
uint32_t row,
uint32_t k,
uint32_t col0,
uint32_t coln,
float inv_diag) {
for (uint32_t col = col0; col < col0 + coln; ++col) {
float sum = 0.0f;
for (uint32_t t = 0; t < row; ++t) {
sum += A_row[t] * X[t * k + col];
}
X[row * k + col] = (B_row[col] - sum) * inv_diag;
}
}
static inline HVX_Vector hvx_load_partial_f32(const float * src, uint32_t n) {
HVX_Vector v = *((const HVX_UVector *) src);
HVX_VectorPred mask = Q6_Q_vsetq2_R(n * sizeof(float));
return Q6_V_vmux_QVV(mask, v, Q6_V_vzero());
}
static inline void solve_tri_row_hvx(const float * A_row,
const float * B_row,
float * X,
uint32_t row,
uint32_t k,
uint32_t col0,
uint32_t coln,
float inv_diag) {
const bool full = (coln == VLEN_FP32);
HVX_Vector sum_v = Q6_V_vzero();
for (uint32_t t = 0; t < row; ++t) {
const float a = A_row[t];
const float * x_row_col = X + t * k + col0;
HVX_Vector x_v = full ? *((const HVX_UVector *) x_row_col) : hvx_load_partial_f32(x_row_col, coln);
HVX_Vector a_v = hvx_vec_splat_f32(a);
sum_v = hvx_vec_add_f32_f32(sum_v, hvx_vec_mul_f32_f32(x_v, a_v));
}
const float * b_row_col = B_row + col0;
float * x_out_col = X + row * k + col0;
HVX_Vector b_v = full ? *((const HVX_UVector *) b_row_col) : hvx_load_partial_f32(b_row_col, coln);
HVX_Vector inv_diag_v = hvx_vec_splat_f32(inv_diag);
HVX_Vector out_v = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(b_v, sum_v), inv_diag_v);
hvx_vec_store_u((void *) x_out_col, coln * sizeof(float), out_v);
}
// Batch-level thread: each job is one full batch.
static void solve_tri_batch_thread_f32(unsigned int nth, unsigned int ith, void * data) {
struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
struct htp_ops_context * octx = sctx->octx;
const struct htp_tensor * src0 = octx->src[0]; // A
const struct htp_tensor * src1 = octx->src[1]; // B
const struct htp_tensor * dst = octx->dst; // X
const uint32_t n = src0->ne[0];
const uint32_t k = src1->ne[0];
const uint32_t ne02 = src0->ne[2];
const uint32_t col_block = VLEN_FP32;
const uint32_t k_full = (k / col_block) * col_block;
const uint32_t start_batch = sctx->jobs_per_thread * ith;
const uint32_t end_batch = MIN(start_batch + sctx->jobs_per_thread, sctx->total_jobs);
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
for (uint32_t batch = start_batch; batch < end_batch; ++batch) {
const uint32_t i03 = batch / ne02;
const uint32_t i02 = batch - i03 * ne02;
const float * A_batch =
(const float *) ((const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3]);
const float * B_batch =
(const float *) ((const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3]);
float * X_batch = (float *) ((uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3]);
for (uint32_t row = 0; row < n; ++row) {
const float diag = A_batch[row * n + row];
const float inv_diag = 1.0f / diag;
const float * A_row = A_batch + row * n;
const float * B_row = B_batch + row * k;
uint32_t col0 = 0;
for (; col0 < k_full; col0 += col_block) {
solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, col_block, inv_diag);
}
if (col0 < k) {
const uint32_t coln = k - col0;
if (coln >= 8) {
solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
} else {
solve_tri_row_scalar(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
}
}
}
}
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "solve-tri-batch %d/%d: A=(%ux%u) B=(%ux%u) batch %u:%u usec %u\n",
ith, nth, n, n, k, n, start_batch, end_batch,
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
// Chunk-level thread: each job is one (batch, col_chunk) pair.
static void solve_tri_chunk_thread_f32(unsigned int nth, unsigned int ith, void * data) {
struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
struct htp_ops_context * octx = sctx->octx;
const struct htp_tensor * src0 = octx->src[0]; // A
const struct htp_tensor * src1 = octx->src[1]; // B
const struct htp_tensor * dst = octx->dst; // X
const uint32_t n = src0->ne[0];
const uint32_t k = src1->ne[0];
const uint32_t ne02 = src0->ne[2];
const uint32_t start_job = sctx->jobs_per_thread * ith;
const uint32_t end_job = MIN(start_job + sctx->jobs_per_thread, sctx->total_jobs);
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
for (uint32_t job = start_job; job < end_job; ++job) {
const uint32_t batch = job / sctx->k_chunks;
const uint32_t chunk = job - batch * sctx->k_chunks;
const uint32_t i03 = batch / ne02;
const uint32_t i02 = batch - i03 * ne02;
const uint32_t col0 = chunk * sctx->col_block;
const uint32_t coln = MIN(sctx->col_block, k - col0);
const float * A_batch =
(const float *) ((const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3]);
const float * B_batch =
(const float *) ((const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3]);
float * X_batch = (float *) ((uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3]);
const bool use_hvx = (coln >= 8);
for (uint32_t row = 0; row < n; ++row) {
const float diag = A_batch[row * n + row];
const float inv_diag = 1.0f / diag;
const float * A_row = A_batch + row * n;
const float * B_row = B_batch + row * k;
if (use_hvx) {
solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
} else {
solve_tri_row_scalar(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
}
}
}
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "solve-tri-chunk %d/%d: A=(%ux%u) B=(%ux%u) job %u:%u usec %u\n",
ith, nth, n, n, k, n, start_job, end_job,
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
int op_solve_tri(struct htp_ops_context * octx) {
const struct htp_tensor * src0 = octx->src[0]; // A
const struct htp_tensor * src1 = octx->src[1]; // B
const struct htp_tensor * dst = octx->dst; // X
if (src0->type != HTP_TYPE_F32 || src1->type != HTP_TYPE_F32 || dst->type != HTP_TYPE_F32) {
return HTP_STATUS_NO_SUPPORT;
}
// left=true, lower=true, uni=false only
if (src0->ne[0] != src0->ne[1]) {
return HTP_STATUS_INVAL_PARAMS;
}
if (src0->ne[1] != src1->ne[1]) {
return HTP_STATUS_INVAL_PARAMS;
}
if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
return HTP_STATUS_INVAL_PARAMS;
}
if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] ||
dst->ne[3] != src1->ne[3]) {
return HTP_STATUS_INVAL_PARAMS;
}
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
return HTP_STATUS_OK;
}
const uint32_t k = src1->ne[0];
const uint32_t col_block = VLEN_FP32;
const uint32_t k_chunks = (k + col_block - 1) / col_block;
const uint32_t total_batches = src0->ne[2] * src0->ne[3];
const bool batched = total_batches >= (uint32_t) octx->n_threads;
FARF(HIGH, "solve-tri: (%ux%ux%ux%u) x (%ux%ux%ux%u) -> (%ux%ux%ux%u) : batched %d\n",
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], batched);
if (batched) {
// Batch-level parallelism
const uint32_t n_threads = MIN((uint32_t) octx->n_threads, total_batches);
struct htp_solve_tri_context sctx = {
.octx = octx,
.jobs_per_thread = (total_batches + n_threads - 1) / n_threads,
.total_jobs = total_batches,
.k_chunks = k_chunks,
.col_block = col_block,
};
worker_pool_run_func(octx->ctx->worker_pool, solve_tri_batch_thread_f32, &sctx, n_threads);
} else {
// Chunk-level parallelism
const uint32_t total_jobs = total_batches * k_chunks;
const uint32_t n_threads = MIN((uint32_t) octx->n_threads, MAX(total_jobs, 1));
struct htp_solve_tri_context sctx = {
.octx = octx,
.jobs_per_thread = (total_jobs + n_threads - 1) / n_threads,
.total_jobs = total_jobs,
.k_chunks = k_chunks,
.col_block = col_block,
};
worker_pool_run_func(octx->ctx->worker_pool, solve_tri_chunk_thread_f32, &sctx, n_threads);
}
return HTP_STATUS_OK;
}
+1 -1
View File
@@ -8,7 +8,7 @@ CatalogFile = libggml-htp.cat
PnpLockDown = 1
[DestinationDirs]
Drivers_Dir = 6
Drivers_Dir = 13
[SourceDisksNames]
1 = %DiskId%
+23 -3
View File
@@ -677,7 +677,15 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_meta
const ggml_type tsrc1 = op->src[1]->type;
const bool bc_inp = op->src[0]->ne[0] % 32 != 0;
const bool bc_out = op->ne[0] % 64 != 0 || op->ne[1] % 32 != 0;
constexpr int NRA = SZ_SIMDGROUP * N_MM_BLOCK_Y * N_MM_SIMD_GROUP_Y;
constexpr int NRB = SZ_SIMDGROUP * N_MM_BLOCK_X * N_MM_SIMD_GROUP_X;
const bool has_tensor = ggml_metal_device_get_props(ggml_metal_library_get_device(lib))->has_tensor;
const bool bc_out = has_tensor
? (op->ne[0] % NRA != 0 || op->ne[1] % NRB != 0)
: (op->ne[0] % 64 != 0 || op->ne[1] % 32 != 0);
snprintf(base, 256, "kernel_mul_mm_%s_%s", ggml_type_name(tsrc0), ggml_type_name(tsrc1));
snprintf(name, 256, "%s_bci=%d_bco=%d", base, bc_inp, bc_out);
@@ -694,8 +702,20 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_meta
ggml_metal_cv_free(cv);
}
// when the output size is not multiple of 64x32, we need extra smem to prevent out-of-bounds writes
res.smem = bc_out ? 8192 : 4096 + 2048;
if (has_tensor) {
res.nr0 = NRA;
res.nr1 = NRB;
const size_t smem_a = NRA * N_MM_NK_TOTAL * sizeof(ggml_fp16_t);
res.smem = smem_a;
} else {
res.nr0 = 64;
res.nr1 = 32;
res.smem = bc_out ? 8192 : (4096 + 2048);
}
res.nsg = N_MM_SIMD_GROUP_X * N_MM_SIMD_GROUP_Y;
return res;
}
+2
View File
@@ -102,6 +102,8 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev
void ggml_metal_library_free(ggml_metal_library_t lib);
ggml_metal_device_t ggml_metal_library_get_device(ggml_metal_library_t lib);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline (ggml_metal_library_t lib, const char * name);
struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv);
+23 -19
View File
@@ -95,8 +95,8 @@ int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_wi
struct ggml_metal_library {
id<MTLLibrary> obj;
id<MTLDevice> device;
ggml_metal_device_t dev;
ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
NSLock * lock;
@@ -251,7 +251,7 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
res->obj = library;
res->device = device;
res->dev = dev;
res->pipelines = ggml_metal_pipelines_init();
res->lock = [NSLock new];
@@ -318,7 +318,7 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev
}
res->obj = library;
res->device = device;
res->dev = dev;
res->pipelines = ggml_metal_pipelines_init();
res->lock = [NSLock new];
@@ -341,6 +341,10 @@ void ggml_metal_library_free(ggml_metal_library_t lib) {
free(lib);
}
ggml_metal_device_t ggml_metal_library_get_device(ggml_metal_library_t lib) {
return lib->dev;
}
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
[lib->lock lock];
@@ -405,7 +409,8 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_
return res;
}
id<MTLComputePipelineState> obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
id<MTLDevice> device = ggml_metal_device_get_obj(lib->dev);
id<MTLComputePipelineState> obj = [device newComputePipelineStateWithFunction:mtl_function error:&error];
[mtl_function release];
@@ -699,7 +704,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
" auto sB = tB.slice(0, 0); \n"
" mm.run(sB, sA, cT); \n"
" \n"
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(16, 16)); \n"
" \n"
" cT.store(tC); \n"
"}";
@@ -749,7 +754,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
" auto sB = tB.slice(0, 0); \n"
" mm.run(sB, sA, cT); \n"
" \n"
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(16, 16)); \n"
" \n"
" cT.store(tC); \n"
"}";
@@ -814,7 +819,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
}
// print MTL GPU family:
GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);
GGML_LOG_INFO("%s: GPU name: %s (%s)\n", __func__, dev->props.name, dev->props.desc);
// determine max supported GPU family
// https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
@@ -931,13 +936,13 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
}
struct ggml_metal_event {
void * obj; // id<MTLEvent>
void * obj; // id<MTLSharedEvent>
atomic_int value;
};
void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
@@ -945,7 +950,7 @@ void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t
}
void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
@@ -953,7 +958,7 @@ void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cm
}
ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
id<MTLEvent> event = [dev->mtl_device newEvent];
id<MTLSharedEvent> event = [dev->mtl_device newSharedEvent];
ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
@@ -964,7 +969,7 @@ ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
}
void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
id<MTLEvent> event = ev->obj;
id<MTLSharedEvent> event = ev->obj;
[event release];
free(ev);
@@ -973,14 +978,13 @@ void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev
}
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
@autoreleasepool {
id<MTLEvent> event = ev->obj;
id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
[cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
[cmd_buf commit];
[cmd_buf waitUntilCompleted];
id<MTLSharedEvent> event = ev->obj;
const bool res = [event waitUntilSignaledValue:atomic_load_explicit(&ev->value, memory_order_relaxed) timeoutMS:60000];
if (!res) {
GGML_ABORT("%s: failed to wait for event\n", __func__);
}
GGML_UNUSED(dev);
}
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
+13
View File
@@ -1,6 +1,19 @@
#ifndef GGML_METAL_IMPL
#define GGML_METAL_IMPL
// kernel parameters for mat-mat threadgroups
//
// TODO: become function constants
#define SZ_SIMDGROUP 16
#define N_MM_NK 2
#define N_MM_NK_TOTAL (SZ_SIMDGROUP * N_MM_NK)
#define N_MM_BLOCK_X 4
#define N_MM_BLOCK_Y 2
#define N_MM_SIMD_GROUP_X 2
#define N_MM_SIMD_GROUP_Y 2
// kernel parameters for mat-vec threadgroups
//
// N_R0: number of src0 rows to process per simdgroup
+6 -1
View File
@@ -2195,7 +2195,12 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
const int nr0 = pipeline.nr0;
const int nr1 = pipeline.nr1;
const int nsg = pipeline.nsg;
ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + nr1 - 1) / nr1), ((ne01 + nr0 - 1) / nr0), ne12 * ne13, 32, nsg, 1);
} else {
auto pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);
+134 -99
View File
@@ -9306,7 +9306,137 @@ constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]];
constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]];
// each block_q contains 16*nl weights
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
#ifdef GGML_METAL_HAS_TENSOR
template<
typename SA, typename SA_4x4, typename SA_8x8,
typename SB, typename SB_2x4, typename SB_8x8,
typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread SA_4x4 &),
typename T0, typename T0_4x4, typename T1, typename T1_2x4>
kernel void kernel_mul_mm(
constant ggml_metal_kargs_mul_mm & args,
device const char * srcA,
device const char * srcB,
device char * dst,
threadgroup char * shmem [[threadgroup(0)]],
uint3 tgpig [[threadgroup_position_in_grid]],
ushort tiitg [[thread_index_in_threadgroup]],
ushort sgitg [[simdgroup_index_in_threadgroup]]) {
(void) sgitg;
// Matrix dimensions: A(M,K) x B(K,N) -> C(M,N)
const int K = args.ne00;
const int M = args.ne0;
const int N = args.ne1;
// Batch dimension handling
const int im = tgpig.z;
const int i12 = im % args.ne12;
const int i13 = im / args.ne12;
// Batch offsets for srcA and srcB
const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
// Tile dimensions
constexpr int NRB = SZ_SIMDGROUP * N_MM_BLOCK_X * N_MM_SIMD_GROUP_X;
constexpr int NRA = SZ_SIMDGROUP * N_MM_BLOCK_Y * N_MM_SIMD_GROUP_Y;
// Tile offsets in output matrix
const int ra = tgpig.y * NRA;
const int rb = tgpig.x * NRB;
// Threadgroup memory for dequantized A tile only
threadgroup SA * sa = (threadgroup SA *)(shmem);
// Work-item count for A loading
constexpr int A_WORK_ITEMS = NRA * N_MM_NK;
constexpr int NUM_THREADS = N_SIMDWIDTH * N_MM_SIMD_GROUP_X * N_MM_SIMD_GROUP_Y;
// tA wraps threadgroup memory
auto tA = tensor(sa, dextents<int32_t, 2>(N_MM_NK_TOTAL, NRA));
// tB wraps device memory directly
device T1 * ptrB = (device T1 *)(srcB + args.nb12*i12 + args.nb13*i13);
const int strideB = args.nb11 / sizeof(T1);
auto tB = tensor(ptrB, dextents<int32_t, 2>(K, N), array<int, 2>({1, strideB}));
// Configure matmul operation
mpp::tensor_ops::matmul2d<
mpp::tensor_ops::matmul2d_descriptor(
NRB, NRA, N_MM_NK_TOTAL, false, true, true,
mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate),
execution_simdgroups<N_MM_SIMD_GROUP_X * N_MM_SIMD_GROUP_Y>> mm;
auto cT = mm.get_destination_cooperative_tensor<decltype(tB), decltype(tA), float>();
// Accumulate partial results over K dimension
for (int loop_k = 0; loop_k < K; loop_k += N_MM_NK_TOTAL) {
// === PHASE 1: Dequantization of A into threadgroup memory ===
for (int work = tiitg; work < A_WORK_ITEMS; work += NUM_THREADS) {
const int row = work / N_MM_NK;
const int k_chunk = work % N_MM_NK;
const int k_pos = loop_k + k_chunk * 16;
const short k_base = k_chunk * 16;
// Bounds check: skip device read if row is out of matrix bounds
if (ra + row < M) {
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
// Element-wise reads when K is not aligned (nb01 not aligned for half4x4/float4x4).
// MSL spec Table 2.5: half4x4 requires 8-byte alignment. When K is odd,
// nb01 = K*2 is not 8-byte aligned, so odd-row pointers are misaligned.
// Mirrors the legacy kernel's existing guard.
device const T0 * row_ptr = (device const T0 *)(srcA + args.nb01 * (ra + row) + offset0);
FOR_UNROLL (short i = 0; i < 16; i++) {
sa[row * N_MM_NK_TOTAL + (k_base + i)] = (k_pos + i < K) ? (SA) row_ptr[k_pos + i] : (SA)0;
}
} else {
const int block_idx = k_pos / (16 * nl);
const short il = (k_pos / 16) % nl;
device const block_q * row_ptr = (device const block_q *)(srcA + args.nb01 * (ra + row) + offset0);
SA_4x4 temp_a;
dequantize_func(row_ptr + block_idx, il, temp_a);
FOR_UNROLL (short i = 0; i < 16; i++) {
// Zero-pad A for K positions beyond valid range (handles partial K iterations)
sa[row * N_MM_NK_TOTAL + (k_base + i)] = (k_pos + i < K) ? temp_a[i/4][i%4] : (SA)0;
}
}
} else {
// Zero-pad rows beyond matrix bounds
FOR_UNROLL (short i = 0; i < 16; i++) {
sa[row * N_MM_NK_TOTAL + (k_base + i)] = (SA)0;
}
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// === PHASE 2: Tensor matmul ===
auto mA = tA.slice(0, 0);
auto mB = tB.slice(loop_k, rb);
mm.run(mB, mA, cT);
threadgroup_barrier(mem_flags::mem_threadgroup);
}
// Store result tile to output matrix (with batch offset)
// cT.store handles bounds checking via tD's extents (M, N)
device float * dstBatch = (device float *)dst + im * N * M;
auto tD = tensor(dstBatch, dextents<int32_t, 2>(M, N), array<int, 2>({1, M}));
cT.store(tD.slice(ra, rb));
}
#else
template<
typename S0, typename S0_4x4, typename S0_8x8,
typename S1, typename S1_2x4, typename S1_8x8,
typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &),
typename T0, typename T0_4x4, typename T1, typename T1_2x4>
kernel void kernel_mul_mm(
constant ggml_metal_kargs_mul_mm & args,
device const char * src0,
@@ -9320,10 +9450,6 @@ kernel void kernel_mul_mm(
threadgroup S0 * sa = (threadgroup S0 *)(shmem);
threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096);
#ifdef GGML_METAL_HAS_TENSOR
threadgroup float * sc = (threadgroup float *)(shmem);
#endif
constexpr int NR0 = 64;
constexpr int NR1 = 32;
@@ -9363,7 +9489,6 @@ kernel void kernel_mul_mm(
+ args.nb11*(r1 + lr1)
+ args.nb10*iy);
#ifndef GGML_METAL_HAS_TENSOR
S0_8x8 ma[4];
S1_8x8 mb[2];
@@ -9372,19 +9497,8 @@ kernel void kernel_mul_mm(
for (short i = 0; i < 8; i++){
mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
}
#else
auto tA = tensor<threadgroup S0, dextents<int32_t, 2>, tensor_inline>(sa, dextents<int32_t, 2>(NK, NR0));
auto tB = tensor<threadgroup S1, dextents<int32_t, 2>, tensor_inline>(sb, dextents<int32_t, 2>(NR1, NK ));
mpp::tensor_ops::matmul2d<
mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate),
execution_simdgroups<4>> mm;
auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>();
#endif
for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) {
#ifndef GGML_METAL_HAS_TENSOR
// load data and store to threadgroup memory
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -9454,66 +9568,6 @@ kernel void kernel_mul_mm(
*(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y));
}
#else
// load data and store to threadgroup memory
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// no need for dequantization
for (short i = 0; i < 16; i++) {
const short sx = 2*il0 + i/8;
const short sy = (tiitg/NL0)/8;
const short lx = i%8;
const short ly = (tiitg/NL0)%8;
//const short lx = (tiitg/NL0)%8;
//const short ly = i%8;
*(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
}
} else {
S0_4x4 temp_a;
dequantize_func(x, il, temp_a);
threadgroup_barrier(mem_flags::mem_threadgroup);
FOR_UNROLL (short i = 0; i < 16; i++) {
const short sx = 2*il0 + i/8;
const short sy = (tiitg/NL0)/8;
const short lx = i%8;
const short ly = (tiitg/NL0)%8;
//const short lx = (tiitg/NL0)%8;
//const short ly = i%8;
*(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4];
}
}
if (FC_mul_mm_bc_inp) {
for (short i = 0; i < 8; ++i) {
const short sx = (tiitg%NL1);
const short sy = (tiitg/NL1)/8;
const short lx = i;
const short ly = (tiitg/NL1)%8;
//const short lx = (tiitg/NL1)%8;
//const short ly = i;
*(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
}
} else {
const short sx = (tiitg%NL1);
const short sy = (tiitg/NL1)/8;
//const short lx = i;
const short ly = (tiitg/NL1)%8;
//const short lx = (tiitg/NL1)%8;
//const short ly = i;
*(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y));
}
#endif
il = (il + 2 < nl) ? il + 2 : il % 2;
x = (il < 2) ? x + (2 + nl - 1)/nl : x;
@@ -9522,7 +9576,6 @@ kernel void kernel_mul_mm(
threadgroup_barrier(mem_flags::mem_threadgroup);
#ifndef GGML_METAL_HAS_TENSOR
// load matrices from threadgroup memory and conduct outer products
threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2));
threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2));
@@ -9549,24 +9602,10 @@ kernel void kernel_mul_mm(
lsma += 8*64;
lsmb += 4*64;
}
#else
auto sA = tA.slice(0, 0);
auto sB = tB.slice(0, 0);
mm.run(sB, sA, cT);
#endif
}
if (!FC_mul_mm_bc_out || (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1)) {
// if no bounds checks on the output are needed, we can directly write to device memory
#ifdef GGML_METAL_HAS_TENSOR
device float * C = (device float *) dst +
r0 + \
r1 * args.ne0 + im*args.ne1*args.ne0;
auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(args.ne0, NR1));
cT.store(tC);
#else
device float * C = (device float *) dst +
(r0 + 32*(sgitg & 1)) + \
(r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
@@ -9574,21 +9613,15 @@ kernel void kernel_mul_mm(
for (short i = 0; i < 8; i++) {
simdgroup_store(mc[i], C + 8*(i%4) + 8*args.ne0*(i/4), args.ne0, 0, false);
}
#endif
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0;
#ifdef GGML_METAL_HAS_TENSOR
auto tC = tensor<threadgroup float, dextents<int32_t, 2>, tensor_inline>(sc, dextents<int32_t, 2>(NR0, NR1));
cT.store(tC);
#else
for (short i = 0; i < 8; i++) {
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false);
}
#endif
threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -9614,6 +9647,8 @@ kernel void kernel_mul_mm(
}
}
#endif // GGML_METAL_HAS_TENSOR
template<short ne20> // n_expert_used
kernel void kernel_mul_mm_id_map0(
constant ggml_metal_kargs_mul_mm_id_map0 & args,
@@ -9789,7 +9824,7 @@ kernel void kernel_mul_mm_id(
const short ib = 8*sx + sy;
*(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
*(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? (S0) *((device T0 *) x + i) : (S0) 0;
}
} else {
S0_4x4 temp_a;
+5
View File
@@ -96,6 +96,8 @@ set(GGML_OPENCL_KERNELS
mul_mv_q6_k_f32_flat
mul_mv_q8_0_f32
mul_mv_q8_0_f32_flat
mul_mv_iq4_nl_f32
mul_mv_iq4_nl_f32_flat
mul_mv_mxfp4_f32
mul_mv_mxfp4_f32_flat
mul_mv_id_q4_0_f32_8x_flat
@@ -110,12 +112,15 @@ set(GGML_OPENCL_KERNELS
mul_mm_q4_0_f32_l4_lm
mul_mm_q4_1_f32_l4_lm
mul_mm_q8_0_f32_l4_lm
mul_mm_iq4_nl_f32_l4_lm
mul_mm_q4_k_f32_l4_lm
mul_mm_q5_k_f32_l4_lm
mul_mm_q6_k_f32_l4_lm
mul_mm_q8_0_f32_8x4
gemv_noshuffle_q4_1_f32
gemm_noshuffle_q4_1_f32
gemv_noshuffle_iq4_nl_f32
gemm_noshuffle_iq4_nl_f32
gemv_noshuffle_general_q8_0_f32
gemv_noshuffle_q4_k_f32
gemm_noshuffle_q4_k_f32
+594
View File
@@ -545,6 +545,9 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_convert_block_q5_K_noshuffle;
cl_kernel kernel_restore_block_q5_K_noshuffle;
cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
cl_kernel kernel_convert_block_iq4_nl, kernel_restore_block_iq4_nl;
cl_kernel kernel_convert_block_iq4_nl_noshuffle;
cl_kernel kernel_restore_block_iq4_nl_noshuffle;
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
cl_kernel kernel_mul_mv_q4_1_f32;
cl_kernel kernel_mul_mv_q4_1_f32_flat;
@@ -556,6 +559,8 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mv_q6_K_f32_flat;
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
cl_kernel kernel_mul_mv_iq4_nl_f32;
cl_kernel kernel_mul_mv_iq4_nl_f32_flat;
cl_kernel kernel_solve_tri_f32;
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
cl_kernel kernel_argsort_f32_i32;
@@ -594,6 +599,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mm_q4_k_f32_l4_lm;
cl_kernel kernel_mul_mm_q5_k_f32_l4_lm;
cl_kernel kernel_mul_mm_q6_k_f32_l4_lm;
cl_kernel kernel_mul_mm_iq4_nl_f32_l4_lm;
std::vector<ProfilingInfo> profiling_info;
@@ -734,6 +740,8 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_gemm_noshuffle_q6_K_f32;
cl_kernel kernel_gemv_noshuffle_q5_k_f32;
cl_kernel kernel_gemm_noshuffle_q5_k_f32;
cl_kernel kernel_gemv_noshuffle_iq4_nl_f32;
cl_kernel kernel_gemm_noshuffle_iq4_nl_f32;
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
void free() {
@@ -954,6 +962,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q6_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q6_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_iq4_nl = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_iq4_nl", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_iq4_nl = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_iq4_nl", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_iq4_nl_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_iq4_nl_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_iq4_nl_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_iq4_nl_noshuffle", &err), err));
GGML_LOG_CONT(".");
}
@@ -1359,6 +1371,40 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// mul_mv_iq4_nl_f32
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mul_mv_iq4_nl_f32.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mv_iq4_nl_f32.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mv_iq4_nl_f32 = clCreateKernel(prog, "kernel_mul_mv_iq4_nl_f32", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// mul_mv_iq4_nl_f32_flat
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mul_mv_iq4_nl_f32_flat.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mv_iq4_nl_f32_flat.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mv_iq4_nl_f32_flat = clCreateKernel(prog, "kernel_mul_mv_iq4_nl_f32_flat", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// mul_mv_mxfp4_f32
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1567,6 +1613,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// mul_mm_iq4_nl_f32_l4_lm
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mul_mm_iq4_nl_f32_l4_lm.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mm_iq4_nl_f32_l4_lm.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mm_iq4_nl_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_iq4_nl_f32_l4_lm", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// mul_mm_q4_k_f32_l4_lm
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2647,6 +2710,45 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// gemm_noshuffle_iq4_nl_f32
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "gemm_noshuffle_iq4_nl_f32.cl.h"
};
#else
const std::string kernel_src = read_file("gemm_noshuffle_iq4_nl_f32.cl");
#endif
cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_iq4_nl_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_iq4_nl_f32", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// gemv_noshuffle_iq4_nl_f32
{
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable ";
if (backend_ctx->has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
}
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "gemv_noshuffle_iq4_nl_f32.cl.h"
};
#else
const std::string kernel_src = read_file("gemv_noshuffle_iq4_nl_f32.cl");
#endif
cl_program prog = build_program_from_source(
backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
CL_CHECK((backend_ctx->kernel_gemv_noshuffle_iq4_nl_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_iq4_nl_f32", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// mul_mm_q8_0_f32_8x4
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -3597,6 +3699,30 @@ struct ggml_tensor_extra_cl_q8_0 {
}
};
struct ggml_tensor_extra_cl_iq4_nl {
cl_mem q = nullptr;
cl_mem q_img = nullptr;
cl_mem d = nullptr;
cl_mem d_img = nullptr;
size_t size_q = 0;
size_t size_d = 0;
~ggml_tensor_extra_cl_iq4_nl() {
reset();
}
void reset() {
if (q != nullptr) { CL_CHECK(clReleaseMemObject(q)); q = nullptr; }
if (d != nullptr) { CL_CHECK(clReleaseMemObject(d)); d = nullptr; }
q_img = nullptr;
d_img = nullptr;
size_q = 0;
size_d = 0;
}
};
struct ggml_tensor_extra_cl_q4_K {
// Quantized values
cl_mem q = nullptr;
@@ -4097,6 +4223,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
return op->src[1]->type == GGML_TYPE_F32;
} else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q4_1 ||
op->src[0]->type == GGML_TYPE_MXFP4 ||
op->src[0]->type == GGML_TYPE_IQ4_NL ||
op->src[0]->type == GGML_TYPE_Q4_K ||
op->src[0]->type == GGML_TYPE_Q5_K ||
op->src[0]->type == GGML_TYPE_Q6_K) {
@@ -4295,6 +4422,12 @@ struct ggml_backend_opencl_buffer_context {
for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
delete e;
}
for (ggml_tensor_extra_cl_iq4_nl * e : temp_tensor_extras_iq4_nl) {
delete e;
}
for (ggml_tensor_extra_cl_iq4_nl * e : temp_tensor_extras_iq4_nl_in_use) {
delete e;
}
for (ggml_tensor_extra_cl_q4_K * e : temp_tensor_extras_q4_K) {
delete e;
}
@@ -4390,6 +4523,21 @@ struct ggml_backend_opencl_buffer_context {
return extra;
}
ggml_tensor_extra_cl_iq4_nl * ggml_opencl_alloc_temp_tensor_extra_iq4_nl() {
ggml_tensor_extra_cl_iq4_nl * extra;
if (temp_tensor_extras_iq4_nl.empty()) {
extra = new ggml_tensor_extra_cl_iq4_nl();
} else {
extra = temp_tensor_extras_iq4_nl.back();
temp_tensor_extras_iq4_nl.pop_back();
}
temp_tensor_extras_iq4_nl_in_use.push_back(extra);
extra->reset();
return extra;
}
ggml_tensor_extra_cl_q4_K * ggml_opencl_alloc_temp_tensor_extra_q4_K() {
ggml_tensor_extra_cl_q4_K * extra;
if (temp_tensor_extras_q4_K.empty()) {
@@ -4461,6 +4609,11 @@ struct ggml_backend_opencl_buffer_context {
}
temp_tensor_extras_q8_0_in_use.clear();
for (ggml_tensor_extra_cl_iq4_nl * e : temp_tensor_extras_iq4_nl_in_use) {
temp_tensor_extras_iq4_nl.push_back(e);
}
temp_tensor_extras_iq4_nl_in_use.clear();
for (ggml_tensor_extra_cl_q4_K * e : temp_tensor_extras_q4_K_in_use) {
temp_tensor_extras_q4_K.push_back(e);
}
@@ -4492,6 +4645,8 @@ struct ggml_backend_opencl_buffer_context {
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
std::vector<ggml_tensor_extra_cl_iq4_nl *> temp_tensor_extras_iq4_nl;
std::vector<ggml_tensor_extra_cl_iq4_nl *> temp_tensor_extras_iq4_nl_in_use;
std::vector<ggml_tensor_extra_cl_q4_K *> temp_tensor_extras_q4_K;
std::vector<ggml_tensor_extra_cl_q4_K *> temp_tensor_extras_q4_K_in_use;
std::vector<ggml_tensor_extra_cl_q5_K *> temp_tensor_extras_q5_K;
@@ -5123,6 +5278,87 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
return;
}
if (tensor->type == GGML_TYPE_IQ4_NL) {
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
ggml_tensor_extra_cl_iq4_nl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_iq4_nl();
size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)/2);
GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
cl_int err;
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
ggml_nbytes(tensor), NULL, &err);
CL_CHECK(err);
CL_CHECK(clEnqueueWriteBuffer(
queue, data_device, CL_TRUE, 0,
ggml_nbytes(tensor), data, 0, NULL, NULL));
cl_buffer_region region;
// Create subbuffer for scales.
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
region.size = size_d;
extra->d = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
auto previous_origin = region.origin;
// Create subbuffer for quants.
region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
region.size = size_q;
extra->q = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
cl_kernel kernel = backend_ctx->kernel_convert_block_iq4_nl;
if (use_adreno_kernels(backend_ctx, tensor)) {
kernel = backend_ctx->kernel_convert_block_iq4_nl_noshuffle;
}
#else
cl_kernel kernel = backend_ctx->kernel_convert_block_iq4_nl;
#endif
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
cl_uchar mask_0F = 0x0F;
cl_uchar mask_F0 = 0xF0;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &n_blk));
size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64)*64, 1, 1};
size_t local_work_size[] = {64, 1, 1};
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
CL_CHECK(clReleaseMemObject(data_device));
tensor->extra = extra;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (use_adreno_kernels(backend_ctx, tensor)) {
int M = tensor->ne[1];
int K = tensor->ne[0];
GGML_ASSERT(K % 32 == 0);
// Transpose q as ushort
transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
// Transpose d as ushort
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
}
#endif
return;
}
if (tensor->type == GGML_TYPE_Q4_K) {
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
@@ -5775,6 +6011,78 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
CL_CHECK(clReleaseMemObject(data_device));
return;
}
if (tensor->type == GGML_TYPE_IQ4_NL) {
ggml_tensor_extra_cl_iq4_nl * extra = (ggml_tensor_extra_cl_iq4_nl *)tensor->extra;
cl_int err;
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
ggml_nbytes(tensor), NULL, &err);
CL_CHECK(err);
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (use_adreno_kernels(backend_ctx, tensor)) {
static ggml_cl_buffer buf_trans_q;
static ggml_cl_buffer buf_trans_d;
static ggml_cl_buffer buf_unpacked;
cl_int M = tensor->ne[1];
cl_int K = tensor->ne[0];
GGML_ASSERT(K % 32 == 0);
size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*(ggml_blck_size(tensor->type)/2);
size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
buf_trans_q.allocate(backend_ctx->context, size_q);
buf_trans_d.allocate(backend_ctx->context, size_d);
buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
// transpose q, d back
transpose_2d_as_16b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/4);
transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/32);
cl_uchar mask_0F = 0x0F;
cl_uchar mask_F0 = 0xF0;
cl_kernel kernel = backend_ctx->kernel_restore_block_iq4_nl_noshuffle;
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d.buffer));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_unpacked.buffer));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &n_blk));
size_t global_work_size[] = {(size_t)n_blk, 1, 1};
size_t local_work_size[] = {1, 1, 1};
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
return;
}
#endif
cl_kernel kernel = backend_ctx->kernel_restore_block_iq4_nl;
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &n_blk));
size_t global_work_size[] = {(size_t)n_blk, 1, 1};
size_t local_work_size[] = {1, 1, 1};
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_work_size, local_work_size, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
CL_CHECK(clEnqueueReadBuffer(
queue, data_device, CL_TRUE, offset,
size, data, 0, NULL, NULL));
CL_CHECK(clReleaseMemObject(data_device));
return;
}
if (tensor->type == GGML_TYPE_Q4_K) {
ggml_tensor_extra_cl_q4_K * extra = (ggml_tensor_extra_cl_q4_K *)tensor->extra;
@@ -9840,6 +10148,178 @@ static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_t
#endif
}
static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(src1);
GGML_ASSERT(src1->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
ggml_tensor_extra_cl_iq4_nl * extra0_iq4_nl = (ggml_tensor_extra_cl_iq4_nl *)src0->extra;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne1 = dst->ne[1];
GGML_ASSERT(ne00 % 32 == 0);
cl_context context = backend_ctx->context;
cl_kernel kernel;
cl_int err;
cl_image_format img_fmt;
cl_image_desc img_desc;
cl_buffer_region region;
int M = ne01;
int N = ne1;
int K = ne00;
if (ne1 == 1) {
cl_mem q_img = nullptr;
cl_mem b_sub_buf = nullptr;
cl_mem b_img = nullptr;
// image for q
img_fmt = { CL_R, CL_UNSIGNED_INT32};
memset(&img_desc, 0, sizeof(img_desc));
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc.image_width = M * K / 2 / 4;
img_desc.buffer = extra0_iq4_nl->q;
CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
// subbuffer for activations
region.origin = offset1;
region.size = K * N * sizeof(float);
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
// image for activations
img_fmt = {CL_RGBA, CL_FLOAT};
memset(&img_desc, 0, sizeof(img_desc));
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc.image_width = K * N / 4;
img_desc.buffer = b_sub_buf;
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
kernel = backend_ctx->kernel_gemv_noshuffle_iq4_nl_f32;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_iq4_nl->d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01));
size_t local_work_size[3] = {64, 4, 1};
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
CL_CHECK(clReleaseMemObject(q_img));
CL_CHECK(clReleaseMemObject(b_sub_buf));
CL_CHECK(clReleaseMemObject(b_img));
} else {
cl_mem b_sub_buf = nullptr;
cl_mem b_sub_buf_trans = nullptr;
cl_mem b_img = nullptr;
cl_mem b_img_trans = nullptr;
// subbuffer for activations
region.origin = offset1;
region.size = K * N * sizeof(float);
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
// image for activations
img_fmt = {CL_RGBA, CL_FLOAT};
memset(&img_desc, 0, sizeof(img_desc));
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc.image_width = K * N / 4;
img_desc.buffer = b_sub_buf;
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
// pad N to multiple of 8
int extra_elements = N % 8;
int padding = 0;
if (extra_elements > 0){
padding = 8 - extra_elements;
}
// subbuffer for transposed activations
region.origin = 0;
region.size = K * (N + padding) * sizeof(float)/2;
backend_ctx->prealloc_act_trans.allocate(context, region.size);
CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
// image for transposed activations
img_fmt = {CL_RGBA, CL_HALF_FLOAT};
memset(&img_desc, 0, sizeof(img_desc));
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc.image_width = K * (N + padding) / 4;
img_desc.buffer = b_sub_buf_trans;
CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
// transpose activations
int height_B = N/4;
if (height_B == 0) {
height_B = 1;
}
int width_B = K/4;
int padded_height_B = (N + padding)/4;
kernel = backend_ctx->kernel_transpose_32_16;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
size_t local_work_size_t[2] = { 1, 16 };
size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
// gemm
kernel = backend_ctx->kernel_gemm_noshuffle_iq4_nl_f32;
int padded_N = N + padding;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_iq4_nl->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_iq4_nl->d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img_trans));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &padded_N));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne1));
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
size_t local_work_size[3] = {1, 128, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
CL_CHECK(clReleaseMemObject(b_sub_buf));
CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
CL_CHECK(clReleaseMemObject(b_img));
CL_CHECK(clReleaseMemObject(b_img_trans));
}
#else
GGML_UNUSED(backend);
GGML_UNUSED(src0);
GGML_UNUSED(src1);
GGML_UNUSED(dst);
#endif
}
static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
GGML_ASSERT(src0);
@@ -10634,6 +11114,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra;
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
ggml_tensor_extra_cl_iq4_nl * extra0_iq4_nl = (ggml_tensor_extra_cl_iq4_nl *)src0->extra;
ggml_tensor_extra_cl_q4_K * extra0_q4_K = (ggml_tensor_extra_cl_q4_K *)src0->extra;
ggml_tensor_extra_cl_q5_K * extra0_q5_K = (ggml_tensor_extra_cl_q5_K *)src0->extra;
ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
@@ -10738,6 +11219,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
return;
}
// iq4_nl x fp32
if (src0t == GGML_TYPE_IQ4_NL && src1t == GGML_TYPE_F32) {
ggml_cl_mul_mat_iq4_nl_f32_adreno(backend, src0, src1, dst);
return;
}
// q8_0 x fp32
if (src0t == GGML_TYPE_Q8_0 && src1t == GGML_TYPE_F32 &&
enable_adreno_trans_weight(backend_ctx, src0)) {
@@ -11302,6 +11789,48 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
return;
}
case GGML_TYPE_IQ4_NL: {
if (ne11 < 32) {
break;
}
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
break;
}
kernel = backend_ctx->kernel_mul_mm_iq4_nl_f32_l4_lm;
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
int batch_stride_a = ne00*ne01;
int batch_stride_b = ne10*ne11;
int batch_stride_d = ne0*ne1;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_iq4_nl->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_iq4_nl->d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
// 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
return;
}
case GGML_TYPE_Q4_K: {
if (ne11 < 32) {
break;
@@ -11829,6 +12358,70 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
#endif // GGML_OPENCL_SOA_Q
break;
}
case GGML_TYPE_IQ4_NL: {
#ifdef GGML_OPENCL_SOA_Q
kernel = backend_ctx->kernel_mul_mv_iq4_nl_f32_flat;
if (backend_ctx->gpu_family == INTEL) {
nth0 = 16;
nth1 = 1;
ndst = 8;
} else if (backend_ctx->gpu_family == ADRENO) {
nth0 = 64;
nth1 = 1;
ndst = 8;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_iq4_nl->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_iq4_nl->d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
#else
kernel = backend_ctx->kernel_mul_mv_iq4_nl_f32;
if (backend_ctx->gpu_family == INTEL) {
nth0 = 16;
nth1 = 1;
ndst = 4;
} else if (backend_ctx->gpu_family == ADRENO) {
nth0 = 64;
nth1 = 1;
ndst = 4;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
#endif // GGML_OPENCL_SOA_Q
break;
}
@@ -12131,6 +12724,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_MXFP4 ||
src0t == GGML_TYPE_Q4_1 ||
src0t == GGML_TYPE_Q8_0 ||
src0t == GGML_TYPE_IQ4_NL ||
src0t == GGML_TYPE_Q2_K) {
// Each SIMD group produces N_DST values in the result. Assuming each
// workgroup has N_SIMDGROUP SIMD groups, then each workgroup will
+107
View File
@@ -87,6 +87,17 @@ struct block_q6_K {
half d; // super-block scale
};
//------------------------------------------------------------------------------
// block_iq4_nl
//------------------------------------------------------------------------------
#define QK4_NL 32
struct block_iq4_nl
{
half d;
uint8_t qs[QK4_NL / 2];
};
//------------------------------------------------------------------------------
// kernel_convert_block_q4_0
// Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
@@ -895,3 +906,99 @@ kernel void kernel_restore_block_q6_K_noshuffle(
b->scales[i] = s[i];
}
}
//------------------------------------------------------------------------------
// kernel_convert_block_iq4_nl
// Convert the block_iq4_nl format to 2 separate arrays (AOS -> SOA).
//------------------------------------------------------------------------------
kernel void kernel_convert_block_iq4_nl(
global struct block_iq4_nl * src0,
global uchar * dst_q,
global half * dst_d,
uchar mask_0F,
uchar mask_F0,
ulong n_blk
) {
if (get_global_id(0) >= n_blk) {
return;
}
global struct block_iq4_nl * b = (global struct block_iq4_nl *) src0 + get_global_id(0);
global uchar * q = (global uchar *) dst_q + QK4_NL/2*get_global_id(0);
global half * d = (global half *) dst_d + get_global_id(0);
*d = b->d;
for (int i = 0; i < QK4_NL/2; ++i) {
q[i] = b->qs[i];
}
}
kernel void kernel_restore_block_iq4_nl(
global uchar * src_q,
global half * src_d,
global struct block_iq4_nl * dst,
ulong n_blk
) {
if (get_global_id(0) >= n_blk) {
return;
}
global struct block_iq4_nl * b = (global struct block_iq4_nl *) dst + get_global_id(0);
global uchar * q = (global uchar *) src_q + QK4_NL/2*get_global_id(0);
global half * d = (global half *) src_d + get_global_id(0);
b->d = *d;
for (int i = 0; i < QK4_NL/2; ++i) {
b->qs[i] = q[i];
}
}
kernel void kernel_convert_block_iq4_nl_noshuffle(
global struct block_iq4_nl * src0,
global uchar * dst_q,
global half * dst_d,
uchar mask_0F,
uchar mask_F0,
ulong n_blk
) {
if (get_global_id(0) >= n_blk) {
return;
}
global struct block_iq4_nl * b = (global struct block_iq4_nl *) src0 + get_global_id(0);
global uchar * q = (global uchar *) dst_q + QK4_NL/2*get_global_id(0);
global half * d = (global half *) dst_d + get_global_id(0);
*d = b->d;
for (int i = 0; i < QK4_NL/4; ++i) {
uchar x0 = b->qs[2*i + 0];
uchar x1 = b->qs[2*i + 1];
q[i + 0 ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
q[i + QK4_NL/4] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
}
}
kernel void kernel_restore_block_iq4_nl_noshuffle(
global uchar * src_q,
global half * src_d,
global struct block_iq4_nl * dst,
uchar mask_0F,
uchar mask_F0,
ulong n_blk
) {
if (get_global_id(0) >= n_blk) {
return;
}
global struct block_iq4_nl * b = (global struct block_iq4_nl *) dst + get_global_id(0);
global uchar * q = (global uchar *) src_q + QK4_NL/2*get_global_id(0);
global half * d = (global half *) src_d + get_global_id(0);
b->d = *d;
for (int i = 0; i < QK4_NL/4; ++i) {
uchar x0 = q[i + 0 ];
uchar x1 = q[i + QK4_NL/4];
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
}
}
@@ -0,0 +1,150 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#ifdef cl_qcom_reqd_sub_group_size
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
constant half kvalues_iq4nl[16] = {
(half)-127.f, (half)-104.f, (half)-83.f, (half)-65.f,
(half) -49.f, (half) -35.f, (half)-22.f, (half)-10.f,
(half) 1.f, (half) 13.f, (half) 25.f, (half) 38.f,
(half) 53.f, (half) 69.f, (half) 89.f, (half)113.f
};
// Packed LUT: 2 FP16 values per uint, 8 unique constant loads instead of 16
constant uint iq4nl_packed[8] = {
0xD680D7F0u, // idx 0,1: -127, -104
0xD410D530u, // idx 2,3: -83, -65
0xD060D220u, // idx 4,5: -49, -35
0xC900CD80u, // idx 6,7: -22, -10
0x4A803C00u, // idx 8,9: 1, 13
0x50C04E40u, // idx 10,11: 25, 38
0x545052A0u, // idx 12,13: 53, 69
0x57105590u // idx 14,15: 89, 113
};
// Packed dequant: 1 uint constant load (8-way divergence) + shift + as_half
#define IQ4_NL_DEQUANT(nibble) as_half((ushort)(iq4nl_packed[(nibble) >> 1] >> (((nibble) & 1u) << 4)))
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_128
#endif
kernel void kernel_gemm_noshuffle_iq4_nl_f32(
global const ushort * src0_q,
global const half * src0_d,
read_only image1d_buffer_t src1,
global float * dst,
ulong offsetd,
int m,
int n,
int k,
int n_no_padding
) {
dst = (global float *)((global char *)dst + offsetd);
int m_4 = m >> 2;
int n_4 = n >> 2;
int gy = get_global_id(0);
int gx = get_global_id(1);
int gx_2 = gx << 2;
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
half8 B;
half4 dequantized_weights;
global const ushort * weight_ptr = src0_q + gx_2;
global const half * scale_ptr = src0_d + gx_2;
for (int i = 0; i < k; i += 4) {
B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);
ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m));
half4 scale = vload4(0, scale_ptr + (i/32)*(m));
// j=0
dequantized_weights.s0 = IQ4_NL_DEQUANT(bits4.s0 & 0x000Fu) * scale.s0;
dequantized_weights.s1 = IQ4_NL_DEQUANT(bits4.s1 & 0x000Fu) * scale.s1;
dequantized_weights.s2 = IQ4_NL_DEQUANT(bits4.s2 & 0x000Fu) * scale.s2;
dequantized_weights.s3 = IQ4_NL_DEQUANT(bits4.s3 & 0x000Fu) * scale.s3;
c0 += B * dequantized_weights.s0;
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=1
B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
dequantized_weights.s0 = IQ4_NL_DEQUANT((bits4.s0 >> 4) & 0x000Fu) * scale.s0;
dequantized_weights.s1 = IQ4_NL_DEQUANT((bits4.s1 >> 4) & 0x000Fu) * scale.s1;
dequantized_weights.s2 = IQ4_NL_DEQUANT((bits4.s2 >> 4) & 0x000Fu) * scale.s2;
dequantized_weights.s3 = IQ4_NL_DEQUANT((bits4.s3 >> 4) & 0x000Fu) * scale.s3;
c0 += B * dequantized_weights.s0;
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=2
B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
dequantized_weights.s0 = IQ4_NL_DEQUANT((bits4.s0 >> 8) & 0x000Fu) * scale.s0;
dequantized_weights.s1 = IQ4_NL_DEQUANT((bits4.s1 >> 8) & 0x000Fu) * scale.s1;
dequantized_weights.s2 = IQ4_NL_DEQUANT((bits4.s2 >> 8) & 0x000Fu) * scale.s2;
dequantized_weights.s3 = IQ4_NL_DEQUANT((bits4.s3 >> 8) & 0x000Fu) * scale.s3;
c0 += B * dequantized_weights.s0;
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=3
B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
dequantized_weights.s0 = IQ4_NL_DEQUANT((bits4.s0 >> 12) & 0x000Fu) * scale.s0;
dequantized_weights.s1 = IQ4_NL_DEQUANT((bits4.s1 >> 12) & 0x000Fu) * scale.s1;
dequantized_weights.s2 = IQ4_NL_DEQUANT((bits4.s2 >> 12) & 0x000Fu) * scale.s2;
dequantized_weights.s3 = IQ4_NL_DEQUANT((bits4.s3 >> 12) & 0x000Fu) * scale.s3;
c0 += B * dequantized_weights.s0;
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
}
int idx = (gy<<3)*m + (gx<<2);
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
}
}
@@ -0,0 +1,302 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#ifdef cl_qcom_reqd_sub_group_size
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#endif
#define QK4_NL 32
#define NSUBGROUPS 4
#define SUBGROUP_SIZE 64
constant half kvalues_iq4nl[16] = {
(half)-127.f, (half)-104.f, (half)-83.f, (half)-65.f,
(half) -49.f, (half) -35.f, (half)-22.f, (half)-10.f,
(half) 1.f, (half) 13.f, (half) 25.f, (half) 38.f,
(half) 53.f, (half) 69.f, (half) 89.f, (half)113.f
};
// Packed LUT: 2 FP16 values per uint, 8 unique constant loads instead of 16
constant uint iq4nl_packed[8] = {
0xD680D7F0u, // idx 0,1: -127, -104
0xD410D530u, // idx 2,3: -83, -65
0xD060D220u, // idx 4,5: -49, -35
0xC900CD80u, // idx 6,7: -22, -10
0x4A803C00u, // idx 8,9: 1, 13
0x50C04E40u, // idx 10,11: 25, 38
0x545052A0u, // idx 12,13: 53, 69
0x57105590u // idx 14,15: 89, 113
};
// Packed dequant: 1 uint constant load (8-way divergence) + shift + as_half
#define IQ4_NL_DEQUANT(nibble) as_half((ushort)(iq4nl_packed[(nibble) >> 1] >> (((nibble) & 1u) << 4)))
#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
float shared_y; \
shared_y = sub_group_broadcast(y.s0, 0); \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s0 & 0x000F)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s1 & 0x000F)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 0); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s0 & 0x00F0) >> 4)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s1 & 0x00F0) >> 4)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 0); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s0 & 0x0F00) >> 8)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s1 & 0x0F00) >> 8)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 0); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s0 & 0xF000) >> 12)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s1 & 0xF000) >> 12)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 0); \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s2 & 0x000F)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s3 & 0x000F)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 0); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s2 & 0x00F0) >> 4)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s3 & 0x00F0) >> 4)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 0); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s2 & 0x0F00) >> 8)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s3 & 0x0F00) >> 8)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 0); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s2 & 0xF000) >> 12)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s3 & 0xF000) >> 12)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 1); \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s4 & 0x000F)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s5 & 0x000F)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 1); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s4 & 0x00F0) >> 4)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s5 & 0x00F0) >> 4)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 1); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s4 & 0x0F00) >> 8)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s5 & 0x0F00) >> 8)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 1); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s4 & 0xF000) >> 12)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s5 & 0xF000) >> 12)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 1); \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s6 & 0x000F)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s7 & 0x000F)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 1); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s6 & 0x00F0) >> 4)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s7 & 0x00F0) >> 4)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 1); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s6 & 0x0F00) >> 8)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s7 & 0x0F00) >> 8)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 1); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s6 & 0xF000) >> 12)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s7 & 0xF000) >> 12)) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y.s0, 2); \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s0 & 0x000F)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s1 & 0x000F)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 2); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s0 & 0x00F0) >> 4)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s1 & 0x00F0) >> 4)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 2); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s0 & 0x0F00) >> 8)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s1 & 0x0F00) >> 8)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 2); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s0 & 0xF000) >> 12)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s1 & 0xF000) >> 12)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 2); \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s2 & 0x000F)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s3 & 0x000F)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 2); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s2 & 0x00F0) >> 4)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s3 & 0x00F0) >> 4)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 2); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s2 & 0x0F00) >> 8)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s3 & 0x0F00) >> 8)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 2); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s2 & 0xF000) >> 12)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s3 & 0xF000) >> 12)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 3); \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s4 & 0x000F)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s5 & 0x000F)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 3); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s4 & 0x00F0) >> 4)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s5 & 0x00F0) >> 4)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 3); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s4 & 0x0F00) >> 8)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s5 & 0x0F00) >> 8)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 3); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s4 & 0xF000) >> 12)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s5 & 0xF000) >> 12)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 3); \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s6 & 0x000F)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s7 & 0x000F)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 3); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s6 & 0x00F0) >> 4)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s7 & 0x00F0) >> 4)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 3); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s6 & 0x0F00) >> 8)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s7 & 0x0F00) >> 8)) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 3); \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s6 & 0xF000) >> 12)) * scale.s0 * shared_y; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s7 & 0xF000) >> 12)) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
float8 shared_y; \
shared_y = sub_group_broadcast(y, 0); \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s0 & 0x000F)) * scale.s0 * shared_y.s0; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s0 & 0x00F0) >> 4)) * scale.s0 * shared_y.s1; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s0 & 0x0F00) >> 8)) * scale.s0 * shared_y.s2; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s0 & 0xF000) >> 12)) * scale.s0 * shared_y.s3; \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s2 & 0x000F)) * scale.s0 * shared_y.s4; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s2 & 0x00F0) >> 4)) * scale.s0 * shared_y.s5; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s2 & 0x0F00) >> 8)) * scale.s0 * shared_y.s6; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s2 & 0xF000) >> 12)) * scale.s0 * shared_y.s7; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s1 & 0x000F)) * scale.s1 * shared_y.s0; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s1 & 0x00F0) >> 4)) * scale.s1 * shared_y.s1; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s1 & 0x0F00) >> 8)) * scale.s1 * shared_y.s2; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s1 & 0xF000) >> 12)) * scale.s1 * shared_y.s3; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s3 & 0x000F)) * scale.s1 * shared_y.s4; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s3 & 0x00F0) >> 4)) * scale.s1 * shared_y.s5; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s3 & 0x0F00) >> 8)) * scale.s1 * shared_y.s6; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s3 & 0xF000) >> 12)) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 1); \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s4 & 0x000F)) * scale.s0 * shared_y.s0; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s4 & 0x00F0) >> 4)) * scale.s0 * shared_y.s1; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s4 & 0x0F00) >> 8)) * scale.s0 * shared_y.s2; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s4 & 0xF000) >> 12)) * scale.s0 * shared_y.s3; \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s6 & 0x000F)) * scale.s0 * shared_y.s4; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s6 & 0x00F0) >> 4)) * scale.s0 * shared_y.s5; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s6 & 0x0F00) >> 8)) * scale.s0 * shared_y.s6; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s6 & 0xF000) >> 12)) * scale.s0 * shared_y.s7; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s5 & 0x000F)) * scale.s1 * shared_y.s0; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s5 & 0x00F0) >> 4)) * scale.s1 * shared_y.s1; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s5 & 0x0F00) >> 8)) * scale.s1 * shared_y.s2; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s5 & 0xF000) >> 12)) * scale.s1 * shared_y.s3; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s7 & 0x000F)) * scale.s1 * shared_y.s4; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s7 & 0x00F0) >> 4)) * scale.s1 * shared_y.s5; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s7 & 0x0F00) >> 8)) * scale.s1 * shared_y.s6; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s7 & 0xF000) >> 12)) * scale.s1 * shared_y.s7; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y, 2); \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s0 & 0x000F)) * scale.s0 * shared_y.s0; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s0 & 0x00F0) >> 4)) * scale.s0 * shared_y.s1; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s0 & 0x0F00) >> 8)) * scale.s0 * shared_y.s2; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s0 & 0xF000) >> 12)) * scale.s0 * shared_y.s3; \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s2 & 0x000F)) * scale.s0 * shared_y.s4; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s2 & 0x00F0) >> 4)) * scale.s0 * shared_y.s5; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s2 & 0x0F00) >> 8)) * scale.s0 * shared_y.s6; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s2 & 0xF000) >> 12)) * scale.s0 * shared_y.s7; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s1 & 0x000F)) * scale.s1 * shared_y.s0; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s1 & 0x00F0) >> 4)) * scale.s1 * shared_y.s1; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s1 & 0x0F00) >> 8)) * scale.s1 * shared_y.s2; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s1 & 0xF000) >> 12)) * scale.s1 * shared_y.s3; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s3 & 0x000F)) * scale.s1 * shared_y.s4; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s3 & 0x00F0) >> 4)) * scale.s1 * shared_y.s5; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s3 & 0x0F00) >> 8)) * scale.s1 * shared_y.s6; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s3 & 0xF000) >> 12)) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 3); \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s4 & 0x000F)) * scale.s0 * shared_y.s0; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s4 & 0x00F0) >> 4)) * scale.s0 * shared_y.s1; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s4 & 0x0F00) >> 8)) * scale.s0 * shared_y.s2; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s4 & 0xF000) >> 12)) * scale.s0 * shared_y.s3; \
total_sums.s0 += IQ4_NL_DEQUANT((bits4.s6 & 0x000F)) * scale.s0 * shared_y.s4; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s6 & 0x00F0) >> 4)) * scale.s0 * shared_y.s5; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s6 & 0x0F00) >> 8)) * scale.s0 * shared_y.s6; \
total_sums.s0 += IQ4_NL_DEQUANT(((bits4.s6 & 0xF000) >> 12)) * scale.s0 * shared_y.s7; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s5 & 0x000F)) * scale.s1 * shared_y.s0; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s5 & 0x00F0) >> 4)) * scale.s1 * shared_y.s1; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s5 & 0x0F00) >> 8)) * scale.s1 * shared_y.s2; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s5 & 0xF000) >> 12)) * scale.s1 * shared_y.s3; \
total_sums.s1 += IQ4_NL_DEQUANT((bits4.s7 & 0x000F)) * scale.s1 * shared_y.s4; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s7 & 0x00F0) >> 4)) * scale.s1 * shared_y.s5; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s7 & 0x0F00) >> 8)) * scale.s1 * shared_y.s6; \
total_sums.s1 += IQ4_NL_DEQUANT(((bits4.s7 & 0xF000) >> 12)) * scale.s1 * shared_y.s7; \
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_gemv_noshuffle_iq4_nl_f32(
read_only image1d_buffer_t src0_q,
global half2 * src0_d,
read_only image1d_buffer_t src1,
global float * dst,
ulong offsetd,
int ne00,
int ne01)
{
uint groupId = get_local_id(1);
uint gid = get_global_id(0);
ushort slid = get_sub_group_local_id();
uint K = ne00;
uint M = ne01;
uint LINE_STRIDE_A = M / 2;
uint BLOCK_STRIDE_A = NSUBGROUPS * M;
private uint4 regA;
private half2 regS;
private float8 regB;
private float2 totalSum = (float2)(0.0f);
// loop along K in block granularity, skip 4 blocks every iter
for (uint k = groupId; k < (K / QK4_NL); k += NSUBGROUPS) {
regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
// first 4 fibers in each wave load 8 B values to its private scope
if (slid < 4) {
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
}
// load half weights for two blocks in consecutive rows
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAST
dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAST
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAST
dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAST
}
// reduction in local memory, assumes #wave=4
local float2 reduceLM[SUBGROUP_SIZE * 3];
if (groupId == 1) {
reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum;
}
if (groupId == 2) {
reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum;
}
if (groupId == 3) {
reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (groupId == 0) {
totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid];
}
if (groupId == 0) {
totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid];
}
if (groupId == 0) {
totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid];
}
// 2 outputs per fiber in wave 0
if (groupId == 0) {
dst = (global float*)((global char*)dst + offsetd);
vstore2(totalSum, 0, &(dst[gid * 2]));
}
}
@@ -0,0 +1,171 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define LOAD_VEC_A 8
#define LOAD_VEC_B 4
#define BM 64
#define BN 64
#define BK 32
#define TM 4
#define TN 8
constant float kvalues_iq4nl[16] = {
-127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f,
1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
};
kernel void kernel_mul_mm_iq4_nl_f32_l4_lm(
global uchar4 * src0_q,
global half * src0_d,
global float4 * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne11,
int ne12,
int stride_a,
int stride_b,
int stride_d,
int batch_stride_a,
int batch_stride_b,
int batch_stride_d,
int r2,
int r3
) {
src1 = (global float4*)((global char*)src1 + offset1);
dst = (global float *)((global char*)dst + offsetd);
local float buf_a[BM * BK];
local float buf_b[BN * BK];
const int batch_idx = get_global_id(2);
const int i13 = batch_idx / ne12;
const int i12 = batch_idx % ne12;
const int i03 = i13 / r3;
const int i02 = i12 / r2;
const int batch_idx_a = i03 * ne02 + i02;
const int ir = get_group_id(0);
const int ic = get_group_id(1);
const int tid = get_local_id(0);
const int th_r = tid % (BM / TM);
const int th_c = tid / (BM / TM);
const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
float sums[TM * TN];
float cache_a[TM];
float cache_b[TN];
for (int i = 0; i < TM * TN; i++) {
sums[i] = 0.0f;
}
for (int block = 0; block < ne00; block += BK) {
for (int l = 0; l < BM; l += loadstride_a) {
if (ir*BM + loadc_a + l < ne01) {
int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
int ib = idx / 4;
int iqs = idx % 4;
float d = (float)src0_d[ib];
global uchar4 * qs = src0_q + ib*4 + iqs;
uchar4 q = *qs;
// IQ4_NL: use lookup table instead of linear (nibble - 8)
float4 v1 = (float4)(kvalues_iq4nl[(q.s0 )&0x0F], kvalues_iq4nl[(q.s1 )&0x0F],
kvalues_iq4nl[(q.s2 )&0x0F], kvalues_iq4nl[(q.s3 )&0x0F])*d;
float4 v2 = (float4)(kvalues_iq4nl[(q.s0>>4)&0x0F], kvalues_iq4nl[(q.s1>>4)&0x0F],
kvalues_iq4nl[(q.s2>>4)&0x0F], kvalues_iq4nl[(q.s3>>4)&0x0F])*d;
buf_a[(loadr_a * 4 + 0) * BM + loadc_a + l] = v1.s0;
buf_a[(loadr_a * 4 + 1) * BM + loadc_a + l] = v1.s1;
buf_a[(loadr_a * 4 + 2) * BM + loadc_a + l] = v1.s2;
buf_a[(loadr_a * 4 + 3) * BM + loadc_a + l] = v1.s3;
buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = v2.s0;
buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = v2.s1;
buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = v2.s2;
buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = v2.s3;
} else {
buf_a[(loadr_a * 4 + 0) * BM + loadc_a + l] = 0.0f;
buf_a[(loadr_a * 4 + 1) * BM + loadc_a + l] = 0.0f;
buf_a[(loadr_a * 4 + 2) * BM + loadc_a + l] = 0.0f;
buf_a[(loadr_a * 4 + 3) * BM + loadc_a + l] = 0.0f;
buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = 0.0f;
buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = 0.0f;
buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = 0.0f;
buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = 0.0f;
}
}
for (int l = 0; l < BN; l += loadstride_b) {
if (ic*BN + loadc_b + l < ne11) {
int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
} else {
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f;
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f;
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f;
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
pos_a += BK / LOAD_VEC_A;
pos_b += BK / LOAD_VEC_B;
for (int i = 0; i < BK; i++) {
for (int j = 0; j < TM; j++) {
cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
}
for (int j = 0; j < TN; j++) {
cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
}
for (int cc = 0; cc < TN; cc++) {
for (int cr = 0; cr < TM; cr++) {
const int sums_idx = cc*TM + cr;
sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
const int dr = ir * BM + th_r * TM;
const int dc = ic * BN + th_c * TN;
const int offsets = batch_idx * batch_stride_d;
for (int cc = 0; cc < TN; cc++) {
for (int cr = 0; cr < TM; cr++) {
if (dr + cr < ne01 && dc + cc < ne11) {
dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
}
}
}
}
@@ -0,0 +1,164 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK4_NL 32
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
constant float kvalues_iq4nl[16] = {
-127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f,
1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
};
//------------------------------------------------------------------------------
// block_iq4_nl
//------------------------------------------------------------------------------
struct block_iq4_nl
{
half d;
uint8_t qs[QK4_NL / 2];
};
//------------------------------------------------------------------------------
// mul_vec_q_n_f32
//------------------------------------------------------------------------------
// Compute inner product between half a block of iq4_nl and 16 floats (yl).
// il indicates where the quants begin (0 or 8).
inline float block_iq4_nl_dot_y(
global struct block_iq4_nl * qb_curr,
private float * yl,
int il
) {
float d = qb_curr->d;
float acc = 0.f;
global uchar * qs = qb_curr->qs + il;
for (int i = 0; i < 8; ++i) {
acc += yl[i] * kvalues_iq4nl[qs[i] & 0x0F];
acc += yl[i+8] * kvalues_iq4nl[qs[i] >> 4];
}
return d * acc;
}
#ifdef INTEL_GPU
#define N_DST 4 // each subgroup group works on 4 rows
#define N_SUBGROUP 1 // number of subgroups in a thread group
#define N_SUBGROUP_SIZE 16 // assuming subgroup size is 16
#elif defined (ADRENO_GPU)
#define N_DST 4
#define N_SUBGROUP 1
#define N_SUBGROUP_SIZE 64
#endif
inline void mul_vec_q_n_f32(
global void * src0,
global float * src1,
global float * dst,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
const ulong nb = ne00/QK4_NL;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
int first_row = (r0 * N_SUBGROUP + get_sub_group_id()) * N_DST;
int i12 = im%ne12;
int i13 = im/ne12;
ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
global struct block_iq4_nl * x = (global struct block_iq4_nl *) src0 + offset0;
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
float yl[16]; // src1 vector cache
float sumf[N_DST]={0.f};
int ix = get_sub_group_local_id()/2;
int il = 8*(get_sub_group_local_id()%2);
global float * yb = y + ix * QK4_NL + il;
// each thread in a SIMD group deals with half a block.
for (int ib = ix; ib < nb; ib += N_SUBGROUP_SIZE/2) {
for (int i = 0; i < 8; ++i) {
yl[i] = yb[i];
yl[i+8] = yb[i+16];
}
for (int row = 0; row < N_DST; row++) {
sumf[row] += block_iq4_nl_dot_y(x+ib+row*nb, yl, il);
}
yb += QK4_NL * (N_SUBGROUP_SIZE/2);
}
float tot[N_DST] = {
sub_group_reduce_add(sumf[0]), sub_group_reduce_add(sumf[1]),
sub_group_reduce_add(sumf[2]), sub_group_reduce_add(sumf[3])};
for (int row = 0; row < N_DST; ++row) {
if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot[row];
}
}
}
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_iq4_nl_f32(
global void * src0,
ulong offset0,
global float * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global float*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}
@@ -0,0 +1,202 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK4_NL 32
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
constant float kvalues_iq4nl[16] = {
-127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f,
1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
};
//------------------------------------------------------------------------------
// block_iq4_nl
//------------------------------------------------------------------------------
struct block_iq4_nl
{
half d;
uint8_t qs[QK4_NL / 2];
};
// Compute dot product between half a block of iq4_nl quants and activations.
// x points to the quant bytes, dh points to the scale.
// yl has 16 activation values: [0..7] for low nibbles, [8..15] for high nibbles.
// il indicates offset into the quant bytes (0 or 8).
inline float block_iq4_nl_dot_y_flat(
global uchar * x,
global half * dh,
private float * yl,
int il
) {
float d = *dh;
global uchar * qs = x + il;
float acc = 0.f;
for (int i = 0; i < 8; ++i) {
acc += yl[i] * kvalues_iq4nl[qs[i] & 0x0F];
acc += yl[i+8] * kvalues_iq4nl[qs[i] >> 4];
}
return d * acc;
}
#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH
#ifdef INTEL_GPU
#define N_DST 8 // each subgroup works on 8 rows
#define N_SUBGROUP 1 // number of subgroups in a thread group
#define N_SUBGROUP_SIZE 16 // assuming subgroup size is 16
#elif defined (ADRENO_GPU)
#define N_DST 8
#define N_SUBGROUP 1
#define N_SUBGROUP_SIZE 64
#endif
inline void mul_vec_q_n_f32_8x_flat(
global uchar * src0_q,
global half * src0_d,
global float * src1,
global float * dst,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
const ulong nb = ne00/QK4_NL;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
int first_row = (r0 * N_SUBGROUP + get_sub_group_id()) * N_DST;
int i12 = im%ne12;
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_NL/2 uchars, hence offset for qs is as follows.
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_NL/2;
global uchar * x = (global uchar *) src0_q + offset0_q;
global half * d = (global half *) src0_d + offset0_d;
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
float yl[16];
float8 sumf = 0.f;
int ix = get_sub_group_local_id()/2;
int il = 8*(get_sub_group_local_id()%2);
global float * yb = y + ix*QK4_NL + il;
for (int ib = ix; ib < nb; ib += N_SUBGROUP_SIZE/2) {
for (int i = 0; i < 8; ++i) {
yl[i] = yb[i];
yl[i+8] = yb[i+16];
}
sumf.s0 += block_iq4_nl_dot_y_flat(x + ib*QK4_NL/2 + 0*nb*QK4_NL/2, d + ib + 0*nb, yl, il);
sumf.s1 += block_iq4_nl_dot_y_flat(x + ib*QK4_NL/2 + 1*nb*QK4_NL/2, d + ib + 1*nb, yl, il);
sumf.s2 += block_iq4_nl_dot_y_flat(x + ib*QK4_NL/2 + 2*nb*QK4_NL/2, d + ib + 2*nb, yl, il);
sumf.s3 += block_iq4_nl_dot_y_flat(x + ib*QK4_NL/2 + 3*nb*QK4_NL/2, d + ib + 3*nb, yl, il);
sumf.s4 += block_iq4_nl_dot_y_flat(x + ib*QK4_NL/2 + 4*nb*QK4_NL/2, d + ib + 4*nb, yl, il);
sumf.s5 += block_iq4_nl_dot_y_flat(x + ib*QK4_NL/2 + 5*nb*QK4_NL/2, d + ib + 5*nb, yl, il);
sumf.s6 += block_iq4_nl_dot_y_flat(x + ib*QK4_NL/2 + 6*nb*QK4_NL/2, d + ib + 6*nb, yl, il);
sumf.s7 += block_iq4_nl_dot_y_flat(x + ib*QK4_NL/2 + 7*nb*QK4_NL/2, d + ib + 7*nb, yl, il);
yb += QK4_NL * (N_SUBGROUP_SIZE/2);
}
float8 tot = (float8)(
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
);
if (get_sub_group_local_id() == 0) {
if (first_row + 0 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
}
if (first_row + 1 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
}
if (first_row + 2 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
}
if (first_row + 3 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
}
if (first_row + 4 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
}
if (first_row + 5 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
}
if (first_row + 6 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
}
if (first_row + 7 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
}
}
}
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_iq4_nl_f32_flat(
global uchar * src0_q,
global half * src0_d,
global float * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
src1 = (global float*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
mul_vec_q_n_f32_8x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}
+1 -1
View File
@@ -224,7 +224,7 @@ struct sycl_device_info {
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
bool vmm; // virtual memory support
size_t total_vram;
//sycl_hw_info hw_info; \\ device id and aarch, currently not used
sycl_hw_info hw_info;
optimize_feature opt_feature;
};
+60 -1
View File
@@ -104,6 +104,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
info.devices[i].hw_info = get_device_hw_info(&device);
}
@@ -3703,9 +3704,16 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
// Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
// is enabled takes precedence over DMMV, the current if-else implementation
// requires disabling DMMV if both conditions are met
if (!g_ggml_sycl_prioritize_dmmv && ((should_reorder_tensor(ctx, dst) &&
ggml_sycl_supports_reorder_mmvq(src0->type)))) {
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
// Arc770 get benefit with Q4_0 by skipping it.
if (!(ggml_sycl_info().devices[ctx.device].hw_info.arch ==
gpu_arch::intel_gpu_acm_g10 &&
src0->type == GGML_TYPE_Q4_0)) {
use_dequantize_mul_mat_vec =
use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
}
}
if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
@@ -3808,6 +3816,51 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
}
}
// Fused MoE TG fast path. Returns false to fall back to the per-expert loop below.
static bool ggml_sycl_mul_mat_id_mmvq_fused(
ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst)
{
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
if (ne12 != 1) return false;
if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) return false;
if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
if (!ggml_is_contiguous(src1)) return false;
// Reorder layout not supported; fall back.
const ggml_tensor_extra_gpu * src0_extra =
static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
if (src0_extra && src0_extra->optimized_feature.reorder) return false;
const int64_t n_ids_per_group = ids->ne[0];
if (ids->ne[1] != 1) return false;
if (ne11 != 1 && ne11 != n_ids_per_group) return false;
const queue_ptr stream = ctx.stream();
const int src1_padded_cols = GGML_PAD((int) ne10, MATRIX_ROW_PADDING);
const int n_experts_used = (int) n_ids_per_group;
const int nrows = (int) src0->ne[1];
ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
(size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
char * src1_ddq = src1_q8_alloc.get();
quantize_row_q8_1_sycl<quantize_q8_1>(
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
src1_padded_cols, stream);
const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;
return ggml_sycl_mul_mat_vec_q_id(
src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
(float *) dst->data, (int) ne10, nrows, n_experts_used,
/*expert_weight_stride=*/ src0->nb[2],
/*dst_row_stride=*/ dst->nb[1],
src1_row_stride, stream);
}
static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
ggml_tensor *dst) try {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
@@ -3823,6 +3876,12 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
const int64_t n_as = ne02;
const int64_t n_ids = ids->ne[0];
if (ne12 == 1) {
if (ggml_sycl_mul_mat_id_mmvq_fused(ctx, src0, src1, ids, dst)) {
return;
}
}
std::vector<char> ids_host(ggml_nbytes(ids));
const char * ids_dev = (const char *) ids->data;
+151
View File
@@ -1199,3 +1199,154 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
GGML_UNUSED(src1_ddf_i);
GGML_UNUSED(ctx);
}
// src1_row_stride: 0 for shared src1 (gate/up proj), else per-expert stride (down proj).
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
static void mul_mat_vec_q_moe(
const void * __restrict__ vx_base, const void * __restrict__ vy_base,
float * __restrict__ dst_base, const int32_t * __restrict__ ids_dev,
const int ncols, const int nrows,
const size_t expert_weight_stride, const size_t dst_row_stride,
const size_t src1_row_stride,
const sycl::nd_item<3> & item_ct1) {
const int expert_idx = item_ct1.get_group(1);
const int i02 = ids_dev[expert_idx];
const char * vx = (const char *) vx_base + (size_t) i02 * expert_weight_stride;
const char * vy = (const char *) vy_base + (size_t) expert_idx * src1_row_stride;
float * dst = (float *) ((char *) dst_base + (size_t) expert_idx * dst_row_stride);
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
if (row >= nrows) {
return;
}
const int blocks_per_row = ncols / qk;
constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi;
float tmp = 0.0f;
const block_q_t * x = (const block_q_t *) vx;
const block_q8_1 * y = (const block_q8_1 *) vy;
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
const int ibx = row * blocks_per_row + i;
const int iby = i * (qk / QK8_1);
for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
const int iqs = elem + vdr * (item_ct1.get_local_id(2) % (qi / vdr));
tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
}
}
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
if (item_ct1.get_local_id(2) == 0) {
dst[row] = tmp;
}
}
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
static void launch_mul_mat_vec_q_moe(
const void * vx_base, const void * vy, const int32_t * ids_dev,
float * dst_base, const int ncols, const int nrows, const int n_experts_used,
const size_t expert_weight_stride, const size_t dst_row_stride,
const size_t src1_row_stride,
dpct::queue_ptr stream) {
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, (unsigned) n_experts_used, (unsigned) block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_moe<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
vx_base, vy, dst_base, ids_dev, ncols, nrows,
expert_weight_stride, dst_row_stride, src1_row_stride, item);
});
});
}
bool ggml_sycl_mul_mat_vec_q_id(
enum ggml_type src0_type,
const void * vx_base,
const void * vy,
const int32_t * ids_dev,
float * dst_base,
int ncols,
int nrows,
int n_experts_used,
size_t expert_weight_stride,
size_t dst_row_stride,
size_t src1_row_stride,
dpct::queue_ptr stream) {
switch (src0_type) {
case GGML_TYPE_Q4_0:
launch_mul_mat_vec_q_moe<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
return true;
case GGML_TYPE_Q4_1:
launch_mul_mat_vec_q_moe<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
return true;
case GGML_TYPE_Q5_0:
launch_mul_mat_vec_q_moe<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
return true;
case GGML_TYPE_Q5_1:
launch_mul_mat_vec_q_moe<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
return true;
case GGML_TYPE_Q8_0:
launch_mul_mat_vec_q_moe<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
return true;
case GGML_TYPE_Q2_K:
launch_mul_mat_vec_q_moe<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
return true;
case GGML_TYPE_Q3_K:
launch_mul_mat_vec_q_moe<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
return true;
case GGML_TYPE_Q4_K:
launch_mul_mat_vec_q_moe<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
return true;
case GGML_TYPE_Q5_K:
launch_mul_mat_vec_q_moe<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
return true;
case GGML_TYPE_Q6_K:
launch_mul_mat_vec_q_moe<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
return true;
case GGML_TYPE_MXFP4:
launch_mul_mat_vec_q_moe<QK_MXFP4, QI_MXFP4, block_mxfp4, VDR_MXFP4_Q8_1_MMVQ, vec_dot_mxfp4_q8_1>(
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
return true;
case GGML_TYPE_NVFP4:
launch_mul_mat_vec_q_moe<QK_NVFP4, QI_NVFP4, block_nvfp4, VDR_NVFP4_Q8_1_MMVQ, vec_dot_nvfp4_q8_1>(
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
return true;
default:
return false;
}
}
+16
View File
@@ -24,4 +24,20 @@ void ggml_sycl_op_mul_mat_vec_q(
const int64_t src1_ncols, const int64_t src1_padded_row_size,
const dpct::queue_ptr &stream);
// Requires standard (non-reorder) block layout for src0.
// Returns false if src0_type isn't handled; caller should fall back.
bool ggml_sycl_mul_mat_vec_q_id(
enum ggml_type src0_type,
const void * vx_base, // start of stacked expert weights
const void * vy, // pre-quantized src1 (Q8_1)
const int32_t * ids_dev, // device-side int32, length n_experts_used
float * dst_base,
int ncols,
int nrows,
int n_experts_used,
size_t expert_weight_stride, // bytes between experts in vx_base
size_t dst_row_stride, // bytes between dst rows
size_t src1_row_stride, // 0 = shared src1, else per-expert stride in bytes
dpct::queue_ptr stream);
#endif // GGML_SYCL_MMVQ_HPP
+63 -11
View File
@@ -1,15 +1,67 @@
#include "sycl_hw.hpp"
// TODO: currently not used
/*
sycl_hw_info get_device_hw_info(sycl::device *device_ptr) {
sycl_hw_info res;
int32_t id = device_ptr->get_info<sycl::ext::intel::info::device::device_id>();
res.device_id = id;
using namespace std;
syclex::architecture arch = device_ptr->get_info<syclex::info::device::architecture>();
res.arch = arch;
return res;
}
/*defined in
* /opt/intel/oneapi/compiler/latest/include/sycl/ext/oneapi/experimental/device_architecture.def
*/
static map<gpu_arch, std::pair<const char*, sycl_intel_gpu_family>> arch2name = {
{gpu_arch::intel_gpu_bdw, {"intel_gpu_bdw", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_skl, {"intel_gpu_skl", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_kbl, {"intel_gpu_kbl", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_cfl, {"intel_gpu_cfl", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_apl, {"intel_gpu_apl", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_glk, {"intel_gpu_glk", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_whl, {"intel_gpu_whl", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_aml, {"intel_gpu_aml", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_cml, {"intel_gpu_cml", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_icllp, {"intel_gpu_icllp", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_ehl, {"intel_gpu_ehl", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_tgllp, {"intel_gpu_tgllp", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_rkl, {"intel_gpu_rkl", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_adl_s, {"intel_gpu_adl_s", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_adl_p, {"intel_gpu_adl_p", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_adl_n, {"intel_gpu_adl_n", GPU_FAMILY_IGPU_NON_XE}},
{gpu_arch::intel_gpu_dg1, {"intel_gpu_dg1", GPU_FAMILY_DGPU_CLIENT_GAME}},
{gpu_arch::intel_gpu_acm_g10, {"intel_gpu_acm_g10", GPU_FAMILY_DGPU_CLIENT_GAME}},
{gpu_arch::intel_gpu_acm_g11, {"intel_gpu_acm_g11", GPU_FAMILY_DGPU_CLIENT_GAME}},
{gpu_arch::intel_gpu_acm_g12, {"intel_gpu_acm_g12", GPU_FAMILY_DGPU_CLIENT_GAME}},
{gpu_arch::intel_gpu_pvc, {"intel_gpu_pvc", GPU_FAMILY_DGPU_CLOUD}},
{gpu_arch::intel_gpu_pvc_vg, {"intel_gpu_pvc_vg", GPU_FAMILY_DGPU_CLOUD}},
{gpu_arch::intel_gpu_mtl_u, {"intel_gpu_mtl_u", GPU_FAMILY_IGPU_XE}},
{gpu_arch::intel_gpu_mtl_h, {"intel_gpu_mtl_h", GPU_FAMILY_IGPU_XE}},
{gpu_arch::intel_gpu_arl_h, {"intel_gpu_arl_h", GPU_FAMILY_IGPU_XE}},
{gpu_arch::intel_gpu_bmg_g21, {"intel_gpu_bmg_g21", GPU_FAMILY_DGPU_CLIENT_GAME}},
{gpu_arch::intel_gpu_bmg_g31, {"intel_gpu_bmg_g31", GPU_FAMILY_DGPU_CLIENT_GAME}},
{gpu_arch::intel_gpu_lnl_m, {"intel_gpu_lnl_m", GPU_FAMILY_IGPU_XE}},
{gpu_arch::intel_gpu_ptl_h, {"intel_gpu_ptl_h", GPU_FAMILY_IGPU_XE}},
{gpu_arch::intel_gpu_ptl_u, {"intel_gpu_ptl_u", GPU_FAMILY_IGPU_XE}},
{gpu_arch::intel_gpu_wcl, {"intel_gpu_wcl", GPU_FAMILY_IGPU_XE}}
};
sycl_hw_info get_device_hw_info(sycl::device* device_ptr) {
sycl_hw_info res;
int32_t id =
device_ptr->get_info<sycl::ext::intel::info::device::device_id>();
res.device_id = id;
res.name = device_ptr->get_info<sycl::info::device::name>();
syclex::architecture arch =
device_ptr->get_info<syclex::info::device::architecture>();
res.arch = arch;
map<syclex::architecture,
std::pair<const char*, sycl_intel_gpu_family>>::iterator it =
arch2name.find(res.arch);
if (it != arch2name.end()) {
res.arch_name = it->second.first;
res.gpu_family = it->second.second;
} else {
res.arch_name = "unknown";
res.gpu_family = GPU_FAMILY_UKNOWN;
}
return res;
}
+20 -8
View File
@@ -9,18 +9,30 @@
#include <sycl/sycl.hpp>
namespace syclex = sycl::ext::oneapi::experimental;
using gpu_arch = sycl::ext::oneapi::experimental::architecture;
// TODO: currently not used
/*
struct sycl_hw_info {
syclex::architecture arch;
int32_t device_id;
// It's used to mark the GPU computing capacity
// The value must flow the order of performance.
enum sycl_intel_gpu_family {
GPU_FAMILY_UKNOWN = -1,
// iGPU without Xe core, before Meteor Lake iGPU(Xe)
GPU_FAMILY_IGPU_NON_XE = 0,
// iGPU with Xe core, Meteor Lake iGPU or newer.
GPU_FAMILY_IGPU_XE = 1,
// dGPU for gaming in client/data center (DG1/FLex 140 or newer).
GPU_FAMILY_DGPU_CLIENT_GAME = 2,
// dGPU for AI in cloud, PVC or newer.
GPU_FAMILY_DGPU_CLOUD = 3
};
bool is_in_vector(std::vector<int> &vec, int item);
struct sycl_hw_info {
syclex::architecture arch;
const char* arch_name;
int32_t device_id;
std::string name;
sycl_intel_gpu_family gpu_family;
};
sycl_hw_info get_device_hw_info(sycl::device *device_ptr);
*/
#endif // SYCL_HW_HPP
+290 -158
View File
@@ -26,20 +26,23 @@
// Matrix multiplication parameters
// Register tiling parameters
#define WEBGPU_MUL_MAT_TILE_M 8
#define WEBGPU_MUL_MAT_TILE_N 8
#define WEBGPU_MUL_MAT_TILE_M 4
#define WEBGPU_MUL_MAT_TILE_N 4
#define WEBGPU_MUL_MAT_WG_SIZE_M 8
#define WEBGPU_MUL_MAT_WG_SIZE_N 8
#define WEBGPU_MUL_MAT_TILE_K 32
#define WEBGPU_MUL_MAT_REG_TILE_K_FLOAT 8
#define WEBGPU_MUL_MAT_REG_TILE_K_QUANT 32
// Subgroup matrix parameters
// The number of subgroups in the M dimension
#define WEBGPU_MUL_MAT_SUBGROUP_M 2
// The number of subgroups in the N dimension
#define WEBGPU_MUL_MAT_SUBGROUP_N 2
#define WEBGPU_MUL_MAT_SUBGROUP_N 4
// The number of subgroup matrices each subgroup accumulates over
#define WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M 4
#define WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N 2
#define WEBGPU_MUL_MAT_SUBGROUP_TILE_K_FLOAT 32
#define WEBGPU_MUL_MAT_SUBGROUP_TILE_K_QUANT 32
// Matrix-vector multiplication parameters
#define WEBGPU_MUL_MAT_VEC_WG_SIZE 256
@@ -98,6 +101,29 @@ struct ggml_webgpu_ssm_conv_shader_decisions {
uint32_t tokens_per_wg;
};
struct ggml_webgpu_ssm_scan_pipeline_key {
int type;
int d_state;
bool operator==(const ggml_webgpu_ssm_scan_pipeline_key & other) const {
return type == other.type && d_state == other.d_state;
}
};
struct ggml_webgpu_ssm_scan_pipeline_key_hash {
size_t operator()(const ggml_webgpu_ssm_scan_pipeline_key & key) const {
size_t seed = 0;
ggml_webgpu_hash_combine(seed, key.type);
ggml_webgpu_hash_combine(seed, key.d_state);
return seed;
}
};
struct ggml_webgpu_ssm_scan_shader_decisions {
uint32_t wg_size;
uint32_t tokens_per_tile;
};
/** Argsort **/
struct ggml_webgpu_argsort_shader_lib_context {
@@ -197,11 +223,12 @@ struct ggml_webgpu_row_norm_pipeline_key_hash {
/** RMS_NORM + MUL **/
struct ggml_webgpu_rms_norm_mul_pipeline_key {
bool inplace;
bool src_overlap;
bool inplace; // rn_src == dst
bool overlap; // mul_src == dst
bool src_overlap; // rn_src == mul_src
bool operator==(const ggml_webgpu_rms_norm_mul_pipeline_key & other) const {
return inplace == other.inplace && src_overlap == other.src_overlap;
return inplace == other.inplace && overlap == other.overlap && src_overlap == other.src_overlap;
}
};
@@ -209,6 +236,7 @@ struct ggml_webgpu_rms_norm_mul_pipeline_key_hash {
size_t operator()(const ggml_webgpu_rms_norm_mul_pipeline_key & key) const {
size_t seed = 0;
ggml_webgpu_hash_combine(seed, key.inplace);
ggml_webgpu_hash_combine(seed, key.overlap);
ggml_webgpu_hash_combine(seed, key.src_overlap);
return seed;
}
@@ -434,19 +462,27 @@ struct ggml_webgpu_unary_pipeline_key_hash {
/** FlashAttention */
enum ggml_webgpu_flash_attn_path : uint32_t {
GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX = 0u,
GGML_WEBGPU_FLASH_ATTN_PATH_TILE = 1u,
GGML_WEBGPU_FLASH_ATTN_PATH_VEC = 2u,
};
struct ggml_webgpu_flash_attn_pipeline_key {
ggml_type kv_type;
uint32_t head_dim_qk;
uint32_t head_dim_v;
bool kv_direct;
bool kv_overlap;
bool has_mask;
bool has_sinks;
bool uses_logit_softcap;
uint32_t path;
bool operator==(const ggml_webgpu_flash_attn_pipeline_key & other) const {
return kv_type == other.kv_type && head_dim_qk == other.head_dim_qk && head_dim_v == other.head_dim_v &&
kv_direct == other.kv_direct && has_mask == other.has_mask && has_sinks == other.has_sinks &&
uses_logit_softcap == other.uses_logit_softcap;
kv_direct == other.kv_direct && kv_overlap == other.kv_overlap && has_mask == other.has_mask &&
has_sinks == other.has_sinks && uses_logit_softcap == other.uses_logit_softcap && path == other.path;
}
};
@@ -457,39 +493,70 @@ struct ggml_webgpu_flash_attn_pipeline_key_hash {
ggml_webgpu_hash_combine(seed, key.head_dim_qk);
ggml_webgpu_hash_combine(seed, key.head_dim_v);
ggml_webgpu_hash_combine(seed, key.kv_direct);
ggml_webgpu_hash_combine(seed, key.kv_overlap);
ggml_webgpu_hash_combine(seed, key.has_mask);
ggml_webgpu_hash_combine(seed, key.has_sinks);
ggml_webgpu_hash_combine(seed, key.uses_logit_softcap);
ggml_webgpu_hash_combine(seed, key.path);
return seed;
}
};
struct ggml_webgpu_flash_attn_decisions {
uint32_t q_tile = 0;
uint32_t kv_tile = 0;
uint32_t wg_size = 0;
uint32_t path = GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX;
uint32_t q_tile = 0;
uint32_t kv_tile = 0;
uint32_t wg_size = 0;
bool kv_direct = false;
};
struct ggml_webgpu_flash_attn_vec_decisions {
uint32_t kv_tile = 0;
uint32_t wg_size = 0;
};
inline constexpr uint32_t GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH = 4u;
inline constexpr uint32_t GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE = 4u;
inline uint32_t ggml_webgpu_flash_attn_pick_vec_ne(const ggml_webgpu_flash_attn_pipeline_key & key) {
if (key.path != GGML_WEBGPU_FLASH_ATTN_PATH_VEC || key.kv_type != GGML_TYPE_F16 ||
key.head_dim_qk != key.head_dim_v) {
return 1u;
}
switch (key.head_dim_qk) {
case 64:
case 192:
case 576:
return 2u;
case 96:
return 4u;
default:
return 1u;
}
}
inline ggml_webgpu_flash_attn_pipeline_key ggml_webgpu_flash_attn_make_pipeline_key(
const ggml_webgpu_shader_lib_context & context) {
const ggml_webgpu_shader_lib_context & context,
uint32_t path) {
const bool has_mask = context.src3 != nullptr;
const bool has_sinks = context.src4 != nullptr;
const bool kv_direct = (context.src1->type == GGML_TYPE_F16) && (context.src0->ne[0] % context.sg_mat_k == 0) &&
(context.src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD == 0);
bool kv_direct = false;
if (path != GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
uint32_t kv_direct_align = GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH;
if (path == GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX) {
kv_direct_align = context.sg_mat_k;
}
kv_direct = (context.src1->type == GGML_TYPE_F16) &&
(context.src0->ne[0] % std::max(1u, kv_direct_align) == 0) &&
(context.src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD == 0);
}
ggml_webgpu_flash_attn_pipeline_key key = {};
key.kv_type = context.src1->type;
key.head_dim_qk = (uint32_t) context.src0->ne[0];
key.head_dim_v = (uint32_t) context.src2->ne[0];
key.kv_direct = kv_direct;
key.kv_overlap = context.src_overlap;
key.has_mask = has_mask;
key.has_sinks = has_sinks;
key.uses_logit_softcap = ggml_get_op_params_f32(context.dst, 2) != 0.0f;
key.path = path;
return key;
}
@@ -552,11 +619,19 @@ inline size_t ggml_webgpu_flash_attn_wg_mem_bytes(uint32_t q_tile,
inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_context & context,
const ggml_webgpu_flash_attn_pipeline_key & key) {
const size_t limit_bytes = context.wg_mem_limit_bytes;
const size_t q_tile = context.sg_mat_m;
const size_t limit_bytes = context.wg_mem_limit_bytes;
uint32_t q_tile = context.sg_mat_m;
uint32_t kv_granularity = context.sg_mat_n;
if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
q_tile = GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE;
kv_granularity = std::max(1u, context.max_subgroup_size);
} else if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
q_tile = 1u;
kv_granularity = 8u;
}
const size_t base_q_bytes = (key.head_dim_qk + key.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES +
2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES;
size_t bytes_per_kv = 0;
size_t bytes_per_kv = 0;
if (!key.kv_direct) {
bytes_per_kv += std::max(key.head_dim_qk, key.head_dim_v);
}
@@ -566,23 +641,90 @@ inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_
bytes_per_kv += q_tile;
bytes_per_kv *= GGML_WEBGPU_F16_SIZE_BYTES;
const uint32_t max_kv_tile = (limit_bytes - base_q_bytes) / bytes_per_kv;
return (max_kv_tile / context.sg_mat_n) * context.sg_mat_n;
return (max_kv_tile / kv_granularity) * kv_granularity;
}
inline uint32_t ggml_webgpu_flash_attn_vec_get_kv_tile(const ggml_webgpu_shader_lib_context & context) {
const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context);
const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
uint32_t kv_tile = std::max(context.sg_mat_n, std::min(32u, min_kv_tile));
kv_tile = (kv_tile / context.sg_mat_n) * context.sg_mat_n;
inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(
const ggml_webgpu_shader_lib_context & context,
size_t storage_offset_alignment) {
ggml_webgpu_flash_attn_decisions decisions = {};
const size_t alignment = std::max<size_t>(1u, storage_offset_alignment);
const auto * K = context.src1;
const auto * V = context.src2;
GGML_ASSERT(K != nullptr);
GGML_ASSERT(V != nullptr);
if (key.kv_direct) {
kv_tile = std::min(kv_tile, GGML_WEBGPU_KV_SEQ_PAD);
while (GGML_WEBGPU_KV_SEQ_PAD % kv_tile != 0) {
kv_tile -= context.sg_mat_n;
const auto flash_attn_tensor_offset = [](const ggml_tensor * tensor) -> size_t {
constexpr uintptr_t ptr_base_addr = 0x1000u;
const ggml_tensor * base = tensor->view_src != nullptr ? tensor->view_src : tensor;
return reinterpret_cast<uintptr_t>(base->data) - ptr_base_addr + tensor->view_offs;
};
const uint32_t k_offset_elems =
(uint32_t) ((flash_attn_tensor_offset(K) & (alignment - 1)) / ggml_type_size(K->type));
const uint32_t v_offset_elems =
(uint32_t) ((flash_attn_tensor_offset(V) & (alignment - 1)) / ggml_type_size(V->type));
const bool f16_vec4_aligned = (k_offset_elems % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0u) &&
(v_offset_elems % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0u);
const bool kv_vec_type_supported =
K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0;
const bool use_vec = context.supports_subgroups && (context.src0->ne[1] < 20) && (context.src0->ne[0] % 32 == 0) &&
(context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
kv_vec_type_supported && (K->type != GGML_TYPE_F16 || f16_vec4_aligned) &&
(context.src2->type == K->type);
const bool use_tile = context.supports_subgroups && !context.supports_subgroup_matrix && K->type == GGML_TYPE_F16 &&
V->type == GGML_TYPE_F16 && f16_vec4_aligned &&
(context.src0->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
(context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) && !use_vec;
decisions.path = use_vec ? GGML_WEBGPU_FLASH_ATTN_PATH_VEC :
use_tile ? GGML_WEBGPU_FLASH_ATTN_PATH_TILE :
GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX;
const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context, decisions.path);
decisions.kv_direct = key.kv_direct;
if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
decisions.q_tile = 1u;
decisions.kv_tile = std::max(8u, std::min(32u, min_kv_tile));
decisions.kv_tile = (decisions.kv_tile / 8u) * 8u;
decisions.wg_size = std::max(1u, std::min<uint32_t>(32u, context.max_subgroup_size));
if (decisions.kv_direct) {
decisions.kv_tile = std::min(decisions.kv_tile, GGML_WEBGPU_KV_SEQ_PAD);
while (GGML_WEBGPU_KV_SEQ_PAD % decisions.kv_tile != 0) {
decisions.kv_tile -= 8u;
}
}
return decisions;
}
decisions.q_tile =
decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ? GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE : context.sg_mat_m;
decisions.kv_tile = decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ?
std::min(64u, ggml_webgpu_flash_attn_max_kv_tile(context, key)) :
std::min(ggml_webgpu_flash_attn_max_kv_tile(context, key),
context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
decisions.wg_size = decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ?
GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE :
std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE);
if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
const uint32_t tile_kv_granularity = std::max(1u, context.max_subgroup_size);
decisions.kv_tile =
std::max(tile_kv_granularity, (decisions.kv_tile / tile_kv_granularity) * tile_kv_granularity);
}
if (decisions.kv_direct) {
GGML_ASSERT(decisions.kv_tile <= GGML_WEBGPU_KV_SEQ_PAD);
while (GGML_WEBGPU_KV_SEQ_PAD % decisions.kv_tile != 0) {
decisions.kv_tile -= decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ?
std::max(1u, context.max_subgroup_size) :
context.sg_mat_n;
}
}
return kv_tile;
return decisions;
}
/** Matrix Multiplication **/
@@ -805,6 +947,8 @@ class ggml_webgpu_shader_lib {
solve_tri_pipelines; // type
std::unordered_map<ggml_webgpu_ssm_conv_pipeline_key, webgpu_pipeline, ggml_webgpu_ssm_conv_pipeline_key_hash>
ssm_conv_pipelines; // type/vectorized
std::unordered_map<ggml_webgpu_ssm_scan_pipeline_key, webgpu_pipeline, ggml_webgpu_ssm_scan_pipeline_key_hash>
ssm_scan_pipelines; // type/d_state
std::unordered_map<ggml_webgpu_gated_delta_net_pipeline_key,
webgpu_pipeline,
ggml_webgpu_gated_delta_net_pipeline_key_hash>
@@ -819,8 +963,6 @@ class ggml_webgpu_shader_lib {
repeat_pipelines; // type
std::unordered_map<ggml_webgpu_flash_attn_pipeline_key, webgpu_pipeline, ggml_webgpu_flash_attn_pipeline_key_hash>
flash_attn_pipelines;
std::unordered_map<ggml_webgpu_flash_attn_pipeline_key, webgpu_pipeline, ggml_webgpu_flash_attn_pipeline_key_hash>
flash_attn_vec_pipelines;
std::unordered_map<ggml_webgpu_flash_attn_vec_reduce_pipeline_key,
webgpu_pipeline,
ggml_webgpu_flash_attn_vec_reduce_pipeline_key_hash>
@@ -1319,6 +1461,53 @@ class ggml_webgpu_shader_lib {
return ssm_conv_pipelines[key];
}
webgpu_pipeline get_ssm_scan_pipeline(const ggml_webgpu_shader_lib_context & context) {
ggml_webgpu_ssm_scan_pipeline_key key = {};
key.type = context.dst->type;
key.d_state = (int) context.src0->ne[0];
auto it = ssm_scan_pipelines.find(key);
if (it != ssm_scan_pipelines.end()) {
return it->second;
}
std::vector<std::string> defines;
std::string variant = "ssm_scan";
switch (key.type) {
case GGML_TYPE_F32:
variant += "_f32";
break;
default:
GGML_ABORT("Unsupported type for ssm_scan shader");
}
const uint32_t wg_size = (uint32_t) key.d_state;
constexpr uint32_t tokens_per_tile = 4u;
defines.push_back("WG_SIZE=" + std::to_string(wg_size) + "u");
defines.push_back("TOKENS_PER_TILE=" + std::to_string(tokens_per_tile) + "u");
if (context.supports_subgroups) {
defines.push_back("USE_SUBGROUP_REDUCTION");
variant += "_sg_reduce";
} else {
variant += "_wg_reduce";
}
variant += "_d" + std::to_string(key.d_state);
auto processed = preprocessor.preprocess(wgsl_ssm_scan, defines);
auto decisions = std::make_shared<ggml_webgpu_ssm_scan_shader_decisions>();
decisions->wg_size = wg_size;
decisions->tokens_per_tile = tokens_per_tile;
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
pipeline.context = decisions;
ssm_scan_pipelines[key] = pipeline;
return ssm_scan_pipelines[key];
}
webgpu_pipeline get_gated_delta_net_pipeline(const ggml_webgpu_shader_lib_context & context) {
ggml_webgpu_gated_delta_net_pipeline_key key = {};
key.type = context.dst->type;
@@ -1548,13 +1737,24 @@ class ggml_webgpu_shader_lib {
// VEC/SCALAR controls
defines.push_back(key.vectorized ? "VEC" : "SCALAR");
const bool is_quant = ggml_is_quantized(context.src0->type);
uint32_t tile_k;
if (key.use_subgroup_matrix) {
tile_k = is_quant ? WEBGPU_MUL_MAT_SUBGROUP_TILE_K_QUANT
: WEBGPU_MUL_MAT_SUBGROUP_TILE_K_FLOAT;
} else {
tile_k = is_quant ? WEBGPU_MUL_MAT_REG_TILE_K_QUANT
: WEBGPU_MUL_MAT_REG_TILE_K_FLOAT;
}
// Tiles
defines.push_back("TILE_M=" + std::to_string(WEBGPU_MUL_MAT_TILE_M) + "u");
defines.push_back("TILE_N=" + std::to_string(WEBGPU_MUL_MAT_TILE_N) + "u");
defines.push_back("TILE_K=" + std::to_string(WEBGPU_MUL_MAT_TILE_K) + "u");
// Subgroup matrix specifics
if (key.use_subgroup_matrix) {
defines.push_back("TILE_K=" + std::to_string(tile_k) + "u");
defines.push_back("MAX_SUBGROUP_SIZE=" + std::to_string(context.max_subgroup_size) + "u");
defines.push_back("SUBGROUP_M=" + std::to_string(WEBGPU_MUL_MAT_SUBGROUP_M) + "u");
defines.push_back("SUBGROUP_N=" + std::to_string(WEBGPU_MUL_MAT_SUBGROUP_N) + "u");
@@ -1574,12 +1774,13 @@ class ggml_webgpu_shader_lib {
if (!key.use_subgroup_matrix) {
defines.push_back("WORKGROUP_SIZE_M=" + std::to_string(WEBGPU_MUL_MAT_WG_SIZE_M) + "u");
defines.push_back("WORKGROUP_SIZE_N=" + std::to_string(WEBGPU_MUL_MAT_WG_SIZE_N) + "u");
defines.push_back("TILE_K=" + std::to_string(tile_k) + "u");
}
auto processed = preprocessor.preprocess(shader_src, defines);
auto decisions = std::make_shared<ggml_webgpu_mul_mat_shader_decisions>();
decisions->tile_k = WEBGPU_MUL_MAT_TILE_K;
decisions->tile_k = tile_k;
decisions->tile_m = WEBGPU_MUL_MAT_TILE_M;
decisions->tile_n = WEBGPU_MUL_MAT_TILE_N;
decisions->use_subgroup_matrix = key.use_subgroup_matrix;
@@ -1776,10 +1977,15 @@ class ggml_webgpu_shader_lib {
defines.push_back("SCALAR");
// mul_mat_id is register-tile only.
const uint32_t tile_k = ggml_is_quantized(context.src0->type)
? WEBGPU_MUL_MAT_REG_TILE_K_QUANT
: WEBGPU_MUL_MAT_REG_TILE_K_FLOAT;
// Tiles
defines.push_back("TILE_M=" + std::to_string(WEBGPU_MUL_MAT_TILE_M) + "u");
defines.push_back("TILE_N=" + std::to_string(WEBGPU_MUL_MAT_TILE_N) + "u");
defines.push_back("TILE_K=" + std::to_string(WEBGPU_MUL_MAT_TILE_K) + "u");
defines.push_back("TILE_K=" + std::to_string(tile_k) + "u");
defines.push_back("WORKGROUP_SIZE_M=" + std::to_string(WEBGPU_MUL_MAT_WG_SIZE_M) + "u");
defines.push_back("WORKGROUP_SIZE_N=" + std::to_string(WEBGPU_MUL_MAT_WG_SIZE_N) + "u");
@@ -1790,7 +1996,7 @@ class ggml_webgpu_shader_lib {
auto processed = preprocessor.preprocess(wgsl_mul_mat_id, defines);
auto decisions = std::make_shared<ggml_webgpu_mul_mat_shader_decisions>();
decisions->tile_k = WEBGPU_MUL_MAT_TILE_K;
decisions->tile_k = tile_k;
decisions->tile_m = WEBGPU_MUL_MAT_TILE_M;
decisions->tile_n = WEBGPU_MUL_MAT_TILE_N;
decisions->wg_size_m = WEBGPU_MUL_MAT_WG_SIZE_M;
@@ -1878,6 +2084,7 @@ class ggml_webgpu_shader_lib {
webgpu_pipeline get_rms_norm_mul_pipeline(const ggml_webgpu_shader_lib_context & context) {
ggml_webgpu_rms_norm_mul_pipeline_key key = {};
key.inplace = context.inplace;
key.overlap = context.overlap;
key.src_overlap = context.src_overlap;
auto it = rms_norm_mul_pipelines.find(key);
@@ -1892,6 +2099,9 @@ class ggml_webgpu_shader_lib {
if (key.inplace) {
defines.push_back("INPLACE");
variant += "_inplace";
} else if (key.overlap) {
defines.push_back("OVERLAP");
variant += "_overlap";
} else if (key.src_overlap) {
defines.push_back("SRC_OVERLAP");
variant += "_src_overlap";
@@ -2038,14 +2248,19 @@ class ggml_webgpu_shader_lib {
return repeat_pipelines[key];
}
webgpu_pipeline get_flash_attn_pipeline(const ggml_webgpu_shader_lib_context & context) {
const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context);
auto it = flash_attn_pipelines.find(key);
webgpu_pipeline get_flash_attn_pipeline(const ggml_webgpu_shader_lib_context & context,
size_t storage_offset_alignment) {
const ggml_webgpu_flash_attn_decisions decisions =
ggml_webgpu_flash_attn_get_decisions(context, storage_offset_alignment);
ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context, decisions.path);
auto it = flash_attn_pipelines.find(key);
if (it != flash_attn_pipelines.end()) {
return it->second;
}
std::vector<std::string> defines;
std::string variant = "flash_attn";
std::string variant = decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC ? "flash_attn_vec" :
decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ? "flash_attn_tile" :
"flash_attn";
switch (key.kv_type) {
case GGML_TYPE_F32:
@@ -2067,7 +2282,12 @@ class ggml_webgpu_shader_lib {
if (key.has_mask) {
defines.push_back("MASK");
variant += "_mask";
if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
defines.push_back("BLK");
variant += "_mask_blk";
} else {
variant += "_mask";
}
}
if (key.has_sinks) {
defines.push_back("SINKS");
@@ -2081,6 +2301,10 @@ class ggml_webgpu_shader_lib {
defines.push_back("KV_DIRECT");
variant += "_kvdirect";
}
if (key.kv_overlap) {
defines.push_back("KV_OVERLAP");
variant += "_kv_overlap";
}
defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(key.head_dim_qk));
variant += std::string("_hsqk") + std::to_string(key.head_dim_qk);
@@ -2088,129 +2312,37 @@ class ggml_webgpu_shader_lib {
defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(key.head_dim_v));
variant += std::string("_hsv") + std::to_string(key.head_dim_v);
defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
auto decisions = std::make_shared<ggml_webgpu_flash_attn_decisions>();
decisions->q_tile = context.sg_mat_m;
const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
uint32_t kv_tile = std::min(min_kv_tile, context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
if (key.kv_direct) {
kv_tile = std::min(kv_tile, GGML_WEBGPU_KV_SEQ_PAD);
while (GGML_WEBGPU_KV_SEQ_PAD % kv_tile != 0) {
kv_tile -= context.sg_mat_n;
}
const char * shader_src = wgsl_flash_attn;
if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
defines.push_back("KV_GRANULARITY=8");
defines.push_back(std::string("VEC_NE=") + std::to_string(ggml_webgpu_flash_attn_pick_vec_ne(key)) + "u");
shader_src = wgsl_flash_attn_vec_split;
} else if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
shader_src = wgsl_flash_attn_tile;
defines.push_back("MAX_SUBGROUP_SIZE=" + std::to_string(context.max_subgroup_size));
defines.push_back("KV_STAGE_STRIDE=" + std::to_string(std::max(key.head_dim_qk, key.head_dim_v)));
variant += "_tile";
} else {
defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
}
decisions->kv_tile = kv_tile;
decisions->wg_size = std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE);
defines.push_back(std::string("Q_TILE=") + std::to_string(decisions->q_tile));
defines.push_back(std::string("KV_TILE=") + std::to_string(decisions->kv_tile));
defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions->wg_size));
auto pipeline_decisions = std::make_shared<ggml_webgpu_flash_attn_decisions>(decisions);
defines.push_back(std::string("Q_TILE=") + std::to_string(decisions.q_tile));
defines.push_back(std::string("KV_TILE=") + std::to_string(decisions.kv_tile));
defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions.wg_size));
webgpu_pipeline pipeline =
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(wgsl_flash_attn, defines), variant);
pipeline.context = decisions;
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(shader_src, defines), variant);
pipeline.context = pipeline_decisions;
flash_attn_pipelines[key] = pipeline;
return flash_attn_pipelines[key];
}
webgpu_pipeline get_flash_attn_vec_pipeline(const ggml_webgpu_shader_lib_context & context) {
const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context);
auto it = flash_attn_vec_pipelines.find(key);
if (it != flash_attn_vec_pipelines.end()) {
return it->second;
}
std::vector<std::string> defines;
std::string variant = "flash_attn_vec";
switch (key.kv_type) {
case GGML_TYPE_F32:
defines.push_back("KV_F32");
break;
case GGML_TYPE_F16:
defines.push_back("KV_F16");
break;
case GGML_TYPE_Q4_0:
defines.push_back("KV_Q4_0");
break;
case GGML_TYPE_Q8_0:
defines.push_back("KV_Q8_0");
break;
default:
GGML_ABORT("Unsupported KV type for flash attention shader");
}
variant += std::string("_") + ggml_type_name(key.kv_type);
if (key.has_mask) {
defines.push_back("MASK");
defines.push_back("BLK");
variant += "_mask_blk";
}
if (key.has_sinks) {
defines.push_back("SINKS");
variant += "_sinks";
}
if (key.uses_logit_softcap) {
defines.push_back("LOGIT_SOFTCAP");
variant += "_lgsc";
}
if (key.kv_direct) {
defines.push_back("KV_DIRECT");
variant += "_kvdirect";
}
defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(key.head_dim_qk));
variant += std::string("_hsqk") + std::to_string(key.head_dim_qk);
defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(key.head_dim_v));
variant += std::string("_hsv") + std::to_string(key.head_dim_v);
defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
defines.push_back("Q_TILE=1");
auto decisions = std::make_shared<ggml_webgpu_flash_attn_vec_decisions>();
decisions->kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(context);
decisions->wg_size = std::max(1u, std::min<uint32_t>(32u, context.max_subgroup_size));
uint32_t vec_ne = 1u;
// Keep conservative defaults unless this is the f16 vec-split shape family.
if (key.kv_type == GGML_TYPE_F16 && key.head_dim_qk == key.head_dim_v) {
switch (key.head_dim_qk) {
case 64:
case 192:
case 576:
vec_ne = 2u;
break;
case 96:
vec_ne = 4u;
break;
default:
break;
}
}
defines.push_back(std::string("KV_TILE=") + std::to_string(decisions->kv_tile));
defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions->wg_size));
defines.push_back(std::string("VEC_NE=") + std::to_string(vec_ne) + "u");
webgpu_pipeline pipeline =
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(wgsl_flash_attn_vec_split, defines), variant);
pipeline.context = decisions;
flash_attn_vec_pipelines[key] = pipeline;
return flash_attn_vec_pipelines[key];
}
webgpu_pipeline get_flash_attn_blk_pipeline(const ggml_webgpu_shader_lib_context & context) {
webgpu_pipeline get_flash_attn_blk_pipeline(const ggml_webgpu_shader_lib_context & context, uint32_t kv_tile) {
ggml_webgpu_flash_attn_blk_pipeline_key key = {};
key.kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(context);
key.kv_tile = kv_tile;
auto it = flash_attn_blk_pipelines.find(key);
if (it != flash_attn_blk_pipelines.end()) {
return it->second;
+214 -81
View File
@@ -389,23 +389,6 @@ static size_t ggml_webgpu_tensor_misalignment(webgpu_context & ctx, const ggml_t
return offset & (ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment - 1);
}
static bool ggml_webgpu_flash_attn_use_vec(webgpu_global_context & global_ctx,
const ggml_tensor * Q,
const ggml_tensor * K,
const ggml_tensor * V) {
const size_t alignment = global_ctx->capabilities.limits.minStorageBufferOffsetAlignment;
const uint32_t k_offset_elems =
(uint32_t) ((ggml_webgpu_tensor_offset(K) & (alignment - 1)) / ggml_type_size(K->type));
const uint32_t v_offset_elems =
(uint32_t) ((ggml_webgpu_tensor_offset(V) & (alignment - 1)) / ggml_type_size(V->type));
const bool f16_vec4_aligned = (k_offset_elems % 4u == 0u) && (v_offset_elems % 4u == 0u);
const bool kv_vec_type_supported =
K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0;
return (Q->ne[1] < 20) && (Q->ne[0] % 32 == 0) && (V->ne[0] % 4 == 0) && kv_vec_type_supported &&
(K->type != GGML_TYPE_F16 || f16_vec4_aligned) && (V->type == K->type);
}
static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, const ggml_tensor * t) {
size_t offset = ggml_webgpu_tensor_offset(t);
return offset & ~(ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment - 1);
@@ -1132,6 +1115,80 @@ static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context & ctx,
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
}
static webgpu_encoded_op ggml_webgpu_ssm_scan(webgpu_context & ctx,
ggml_tensor * src0,
ggml_tensor * src1,
ggml_tensor * src2,
ggml_tensor * src3,
ggml_tensor * src4,
ggml_tensor * src5,
ggml_tensor * src6,
ggml_tensor * dst) {
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
shader_lib_ctx.src0 = src0;
shader_lib_ctx.dst = dst;
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
shader_lib_ctx.supports_subgroups = ctx->global_ctx->capabilities.supports_subgroups;
webgpu_pipeline pipeline = ctx->shader_lib->get_ssm_scan_pipeline(shader_lib_ctx);
std::vector<uint32_t> params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src2) / ggml_type_size(src2->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src3) / ggml_type_size(src3->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src4) / ggml_type_size(src4->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src5) / ggml_type_size(src5->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src6) / ggml_type_size(src6->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
(uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
(uint32_t) (src2->nb[1] / ggml_type_size(src2->type)),
(uint32_t) (src2->nb[2] / ggml_type_size(src2->type)),
(uint32_t) src3->ne[0],
(uint32_t) (src3->nb[1] / ggml_type_size(src3->type)),
(uint32_t) (src4->nb[1] / ggml_type_size(src4->type)),
(uint32_t) (src4->nb[2] / ggml_type_size(src4->type)),
(uint32_t) (src4->nb[3] / ggml_type_size(src4->type)),
(uint32_t) (src5->nb[1] / ggml_type_size(src5->type)),
(uint32_t) (src5->nb[2] / ggml_type_size(src5->type)),
(uint32_t) (src5->nb[3] / ggml_type_size(src5->type)),
(uint32_t) src0->ne[0],
(uint32_t) src0->ne[1],
(uint32_t) src0->ne[2],
(uint32_t) src4->ne[1],
(uint32_t) src1->ne[2],
(uint32_t) src1->ne[3],
(uint32_t) ggml_nelements(src1),
};
std::vector<wgpu::BindGroupEntry> entries = {
ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0), ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1),
ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, src2), ggml_webgpu_make_tensor_bind_group_entry(ctx, 3, src3),
ggml_webgpu_make_tensor_bind_group_entry(ctx, 4, src4), ggml_webgpu_make_tensor_bind_group_entry(ctx, 5, src5),
ggml_webgpu_make_tensor_bind_group_entry(ctx, 6, src6), ggml_webgpu_make_tensor_bind_group_entry(ctx, 7, dst),
};
const uint32_t total_wg = (uint32_t) (src0->ne[1] * src0->ne[2] * src1->ne[3]);
const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
uint32_t wg_x;
uint32_t wg_y;
compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
}
static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx,
ggml_tensor * src0,
ggml_tensor * src1,
@@ -1567,7 +1624,6 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
return ggml_backend_webgpu_build_multi(ctx, dispatches);
}
#ifndef __EMSCRIPTEN__
static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
ggml_tensor * Q,
ggml_tensor * K,
@@ -1585,13 +1641,29 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
float m0 = powf(2.0f, -(max_bias) / n_head_log2);
float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
const int has_mask = (mask != nullptr);
const int has_sinks = (sinks != nullptr);
const int has_mask = (mask != nullptr);
const int has_sinks = (sinks != nullptr);
const bool kv_overlap = ggml_webgpu_tensor_overlap(K, V) && K->type == V->type;
uint32_t offset_k = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, K) / ggml_type_size(K->type));
uint32_t offset_v = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, V) / ggml_type_size(V->type));
size_t kv_bind_offset = 0;
size_t kv_bind_size = 0;
if (kv_overlap) {
const size_t k_bind_offset = ggml_webgpu_tensor_align_offset(ctx, K);
const size_t v_bind_offset = ggml_webgpu_tensor_align_offset(ctx, V);
const size_t k_bind_end = k_bind_offset + ggml_webgpu_tensor_binding_size(ctx, K);
const size_t v_bind_end = v_bind_offset + ggml_webgpu_tensor_binding_size(ctx, V);
kv_bind_offset = std::min(k_bind_offset, v_bind_offset);
kv_bind_size = std::max(k_bind_end, v_bind_end) - kv_bind_offset;
offset_k = (uint32_t) ((ggml_webgpu_tensor_offset(K) - kv_bind_offset) / ggml_type_size(K->type));
offset_v = (uint32_t) ((ggml_webgpu_tensor_offset(V) - kv_bind_offset) / ggml_type_size(V->type));
}
std::vector<uint32_t> params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, Q) / ggml_type_size(Q->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, K) / ggml_type_size(K->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, V) / ggml_type_size(V->type)),
offset_k,
offset_v,
has_mask ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mask) / ggml_type_size(mask->type)) : 0,
has_sinks ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, sinks) / ggml_type_size(sinks->type)) : 0,
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
@@ -1619,10 +1691,15 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
};
std::vector<wgpu::BindGroupEntry> entries = {
ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, Q),
ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, K),
ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, V),
};
uint32_t binding_index = 3;
if (kv_overlap) {
entries.push_back(
ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), kv_bind_offset, kv_bind_size));
} else {
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, K));
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, V));
}
uint32_t binding_index = kv_overlap ? 2u : 3u;
if (has_mask) {
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, binding_index++, mask));
}
@@ -1638,25 +1715,25 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
shader_lib_ctx.src3 = mask;
shader_lib_ctx.src4 = sinks;
shader_lib_ctx.dst = dst;
shader_lib_ctx.src_overlap = kv_overlap;
shader_lib_ctx.supports_subgroups = ctx->global_ctx->capabilities.supports_subgroups;
shader_lib_ctx.supports_subgroup_matrix = ctx->global_ctx->capabilities.supports_subgroup_matrix;
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
shader_lib_ctx.wg_mem_limit_bytes = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
shader_lib_ctx.sg_mat_m = ctx->global_ctx->capabilities.sg_mat_m;
shader_lib_ctx.sg_mat_n = ctx->global_ctx->capabilities.sg_mat_n;
shader_lib_ctx.sg_mat_k = ctx->global_ctx->capabilities.sg_mat_k;
shader_lib_ctx.max_subgroup_size = ctx->global_ctx->capabilities.max_subgroup_size;
const bool use_vec = ggml_webgpu_flash_attn_use_vec(ctx->global_ctx, Q, K, V);
webgpu_pipeline pipeline = use_vec ? ctx->shader_lib->get_flash_attn_vec_pipeline(shader_lib_ctx) :
ctx->shader_lib->get_flash_attn_pipeline(shader_lib_ctx);
webgpu_pipeline pipeline = ctx->shader_lib->get_flash_attn_pipeline(
shader_lib_ctx, ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
auto * decisions = static_cast<ggml_webgpu_flash_attn_decisions *>(pipeline.context.get());
if (!use_vec) {
auto * decisions = static_cast<ggml_webgpu_flash_attn_decisions *>(pipeline.context.get());
if (decisions->path != GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
uint32_t wg_per_head = CEIL_DIV(Q->ne[1], decisions->q_tile);
uint32_t wg_x = wg_per_head * Q->ne[2] * Q->ne[3]; // wg per head * number of heads * number of batches
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
}
auto * decisions = static_cast<ggml_webgpu_flash_attn_vec_decisions *>(pipeline.context.get());
wgpu::Buffer blk_buf = {};
uint64_t blk_size_bytes = 0;
uint32_t blk_nblk0 = 0;
@@ -1695,10 +1772,12 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
tmp_bind_size = tmp_size_bytes;
scratch_offset = ROUNDUP_POW2(scratch_offset + tmp_size_bytes, align_bytes);
} else {
// nwg==1 writes final dst directly in vec-split; keep tmp binding valid without extra allocation.
// nwg==1 writes final dst directly in vec-split; bind tmp to a tiny non-overlapping scratch region.
tmp_size_bytes = WEBGPU_STORAGE_BUF_BINDING_MULT;
tmp_buf = ggml_webgpu_tensor_buf(dst);
tmp_bind_offset = ggml_webgpu_tensor_align_offset(ctx, dst);
tmp_bind_size = ggml_webgpu_tensor_binding_size(ctx, dst);
tmp_bind_offset = scratch_offset;
tmp_bind_size = tmp_size_bytes;
scratch_offset = ROUNDUP_POW2(scratch_offset + tmp_size_bytes, align_bytes);
}
webgpu_pipeline blk_pipeline;
@@ -1713,7 +1792,7 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
const uint64_t blk_elems = (uint64_t) blk_nblk0 * blk_nblk1 * blk_batch_count;
blk_size_bytes = ROUNDUP_POW2(blk_elems * sizeof(uint32_t), WEBGPU_STORAGE_BUF_BINDING_MULT);
const ggml_webgpu_shader_lib_context blk_shader_ctx = shader_lib_ctx;
blk_pipeline = ctx->shader_lib->get_flash_attn_blk_pipeline(blk_shader_ctx);
blk_pipeline = ctx->shader_lib->get_flash_attn_blk_pipeline(blk_shader_ctx, decisions->kv_tile);
blk_params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mask) / ggml_type_size(mask->type)), // offset_mask
@@ -1745,12 +1824,19 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
std::vector<wgpu::BindGroupEntry> split_entries = {
ggml_webgpu_make_bind_group_entry(0, ggml_webgpu_tensor_buf(Q), ggml_webgpu_tensor_align_offset(ctx, Q),
ggml_webgpu_tensor_binding_size(ctx, Q)),
ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), ggml_webgpu_tensor_align_offset(ctx, K),
ggml_webgpu_tensor_binding_size(ctx, K)),
ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(V), ggml_webgpu_tensor_align_offset(ctx, V),
ggml_webgpu_tensor_binding_size(ctx, V)),
};
uint32_t split_binding_index = 3;
if (kv_overlap) {
split_entries.push_back(
ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), kv_bind_offset, kv_bind_size));
} else {
split_entries.push_back(ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K),
ggml_webgpu_tensor_align_offset(ctx, K),
ggml_webgpu_tensor_binding_size(ctx, K)));
split_entries.push_back(ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(V),
ggml_webgpu_tensor_align_offset(ctx, V),
ggml_webgpu_tensor_binding_size(ctx, V)));
}
uint32_t split_binding_index = kv_overlap ? 2u : 3u;
if (has_mask) {
split_entries.push_back(ggml_webgpu_make_bind_group_entry(split_binding_index++, ggml_webgpu_tensor_buf(mask),
ggml_webgpu_tensor_align_offset(ctx, mask),
@@ -1820,7 +1906,6 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
return ggml_backend_webgpu_build_multi(ctx, dispatches);
}
#endif // __EMSCRIPTEN__
static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
bool is_unary = dst->op == GGML_OP_UNARY;
@@ -2071,8 +2156,9 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_rms_norm_mul(webgpu_context
GGML_ABORT("rms_norm must be equal to the one of mul_src0 and mul_src1");
}
bool inplace = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) ||
bool overlap = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) ||
(ggml_webgpu_tensor_equal(rn_dst, mul_src1) && ggml_webgpu_tensor_equal(mul_src0, dst));
bool inplace = ggml_webgpu_tensor_equal(rn_src, dst);
bool src_overlap = ggml_webgpu_tensor_overlap(rn_src, mul_src);
uint32_t offset_merged_rn_src = 0;
@@ -2116,7 +2202,7 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_rms_norm_mul(webgpu_context
std::vector<wgpu::BindGroupEntry> entries;
if (inplace) {
if (inplace || overlap) {
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, rn_src));
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, mul_src));
} else if (src_overlap) {
@@ -2136,6 +2222,7 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_rms_norm_mul(webgpu_context
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
shader_lib_ctx.inplace = inplace;
shader_lib_ctx.overlap = overlap;
shader_lib_ctx.src_overlap = src_overlap;
webgpu_pipeline pipeline = ctx->shader_lib->get_rms_norm_mul_pipeline(shader_lib_ctx);
@@ -2708,11 +2795,7 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
case GGML_OP_MUL_MAT_ID:
return ggml_webgpu_mul_mat_id(ctx, src0, src1, src2, node);
case GGML_OP_FLASH_ATTN_EXT:
#ifndef __EMSCRIPTEN__
return ggml_webgpu_flash_attn(ctx, src0, src1, src2, node->src[3], node->src[4], node);
#else
return std::nullopt;
#endif
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_MUL:
@@ -2755,6 +2838,9 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
return ggml_webgpu_solve_tri(ctx, src0, src1, node);
case GGML_OP_SSM_CONV:
return ggml_webgpu_ssm_conv(ctx, src0, src1, node);
case GGML_OP_SSM_SCAN:
return ggml_webgpu_ssm_scan(ctx, src0, src1, src2, node->src[3], node->src[4], node->src[5], node->src[6],
node);
case GGML_OP_GATED_DELTA_NET:
return ggml_webgpu_gated_delta_net(ctx, src0, src1, src2, node->src[3], node->src[4], node->src[5], node);
case GGML_OP_PAD:
@@ -2813,7 +2899,10 @@ static void ggml_backend_webgpu_collect_profile_results(webgpu_context &
}
#endif
// Don't bother checking set_rows index overflow for now, since practically the WebGPU doesn't need to support
// models that would require it right now.
static void ggml_backend_webgpu_check_set_rows(webgpu_context & ctx, uint32_t & num_inflight_batches) {
#ifdef GGML_WEBGPU_CHECK_SET_ROWS
wgpu::CommandEncoder encoder = ctx->global_ctx->device.CreateCommandEncoder();
encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0,
ctx->set_rows_host_error_buf.GetSize());
@@ -2826,6 +2915,10 @@ static void ggml_backend_webgpu_check_set_rows(webgpu_context & ctx, uint32_t &
GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported.");
}
ctx->set_rows_host_error_buf.Unmap();
#else
GGML_UNUSED(ctx);
GGML_UNUSED(num_inflight_batches);
#endif
}
static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
@@ -2911,8 +3004,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
ggml_backend_webgpu_check_set_rows(ctx, num_inflight_batches);
}
ggml_backend_webgpu_wait_queue(ctx->global_ctx);
WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx->global_ctx);
return GGML_STATUS_SUCCESS;
}
@@ -3255,13 +3346,19 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
ctx->webgpu_global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
shader_lib_ctx.wg_mem_limit_bytes =
ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
shader_lib_ctx.supports_subgroups = ctx->webgpu_global_ctx->capabilities.supports_subgroups;
shader_lib_ctx.supports_subgroup_matrix =
ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix;
shader_lib_ctx.sg_mat_m = ctx->webgpu_global_ctx->capabilities.sg_mat_m;
shader_lib_ctx.sg_mat_n = ctx->webgpu_global_ctx->capabilities.sg_mat_n;
shader_lib_ctx.sg_mat_k = ctx->webgpu_global_ctx->capabilities.sg_mat_k;
shader_lib_ctx.max_subgroup_size = ctx->webgpu_global_ctx->capabilities.max_subgroup_size;
if (ggml_webgpu_flash_attn_use_vec(ctx->webgpu_global_ctx, Q, K, V)) {
const uint32_t kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(shader_lib_ctx);
const ggml_webgpu_flash_attn_decisions decisions = ggml_webgpu_flash_attn_get_decisions(
shader_lib_ctx, ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
const uint32_t kv_tile = decisions.kv_tile;
const uint32_t vec_nwg_cap = std::max(
1u, std::min<uint32_t>(32u, ctx->webgpu_global_ctx->capabilities.max_subgroup_size));
@@ -3281,6 +3378,8 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
const size_t tmp_size_bytes = ROUNDUP_POW2(
(tmp_data_elems + tmp_stats_elems) * sizeof(float), WEBGPU_STORAGE_BUF_BINDING_MULT);
res += tmp_size_bytes + align;
} else {
res += WEBGPU_STORAGE_BUF_BINDING_MULT + align;
}
if (mask != nullptr) {
const uint32_t blk_nblk0 = CEIL_DIV((uint32_t) K->ne[1], kv_tile);
@@ -3429,12 +3528,12 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
ctx->webgpu_global_ctx->capabilities.supports_subgroups =
ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::Subgroups);
bool valid_subgroup_matrix_config = false;
#ifndef __EMSCRIPTEN__
// Accept f16 subgroup matrix configurations (square or non-square).
// NVIDIA GPUs typically report square configs (e.g. 16x16x16),
// while Intel Xe2 GPUs report non-square configs (e.g. 8x16x16).
// The shaders are already parameterized to handle any M/N/K dimensions.
bool valid_subgroup_matrix_config = false;
if (ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
for (size_t i = 0; i < subgroup_matrix_configs.configCount; i++) {
const wgpu::SubgroupMatrixConfig config = subgroup_matrix_configs.configs[i];
@@ -3448,8 +3547,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
}
}
}
ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix = valid_subgroup_matrix_config;
#endif
ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix = valid_subgroup_matrix_config;
// For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
// Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
@@ -3497,12 +3596,12 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
// Enable Dawn-specific toggles to increase native performance
// TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
// only for native performance?
const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init",
"disable_polyfills_on_integer_div_and_mod" };
const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
const char * const deviceEnabledToggles[] = { "disable_robustness", "disable_workgroup_init",
"disable_polyfills_on_integer_div_and_mod" };
const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
wgpu::DawnTogglesDescriptor deviceTogglesDesc;
deviceTogglesDesc.enabledToggles = deviceEnabledToggles;
deviceTogglesDesc.enabledToggleCount = 4;
deviceTogglesDesc.enabledToggleCount = 3;
deviceTogglesDesc.disabledToggles = deviceDisabledToggles;
deviceTogglesDesc.disabledToggleCount = 1;
@@ -3780,33 +3879,63 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
break;
case GGML_OP_FLASH_ATTN_EXT:
{
#ifndef __EMSCRIPTEN__
if (!ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix) {
break;
}
// Head dimensions must be divisible by subgroup matrix dimensions
if (src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k != 0 ||
src2->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_n != 0) {
break;
}
// Head dimensions must fit in workgroup memory with minimum tile sizes
size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
const bool has_mask = op->src[3] != nullptr;
const bool kv_direct = src1->type == GGML_TYPE_F16 &&
(src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k) == 0 &&
(src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD) == 0;
const size_t min_bytes = ggml_webgpu_flash_attn_wg_mem_bytes(
ctx->webgpu_global_ctx->capabilities.sg_mat_m, ctx->webgpu_global_ctx->capabilities.sg_mat_n,
(uint32_t) src0->ne[0], (uint32_t) src2->ne[0], has_mask, kv_direct);
if (min_bytes > limit_bytes) {
break;
}
supports_op = src0->type == GGML_TYPE_F32 &&
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 ||
src1->type == GGML_TYPE_Q4_0 || src1->type == GGML_TYPE_Q8_0) &&
src2->type == src1->type && op->type == GGML_TYPE_F32;
#endif
if (!supports_op) {
break;
}
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
shader_lib_ctx.src0 = src0;
shader_lib_ctx.src1 = src1;
shader_lib_ctx.src2 = src2;
shader_lib_ctx.src3 = op->src[3];
shader_lib_ctx.src4 = op->src[4];
shader_lib_ctx.dst = const_cast<ggml_tensor *>(op);
shader_lib_ctx.supports_subgroups = ctx->webgpu_global_ctx->capabilities.supports_subgroups;
shader_lib_ctx.supports_subgroup_matrix = ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix;
shader_lib_ctx.wg_mem_limit_bytes =
ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
shader_lib_ctx.sg_mat_m = ctx->webgpu_global_ctx->capabilities.sg_mat_m;
shader_lib_ctx.sg_mat_n = ctx->webgpu_global_ctx->capabilities.sg_mat_n;
shader_lib_ctx.sg_mat_k = ctx->webgpu_global_ctx->capabilities.sg_mat_k;
shader_lib_ctx.max_subgroup_size = ctx->webgpu_global_ctx->capabilities.max_subgroup_size;
const ggml_webgpu_flash_attn_decisions decisions = ggml_webgpu_flash_attn_get_decisions(
shader_lib_ctx, ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
const size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
const bool has_mask = op->src[3] != nullptr;
if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
const size_t min_bytes =
ggml_webgpu_flash_attn_wg_mem_bytes(decisions.q_tile, decisions.kv_tile, (uint32_t) src0->ne[0],
(uint32_t) src2->ne[0], has_mask, decisions.kv_direct);
if (min_bytes > limit_bytes) {
supports_op = false;
}
break;
}
if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
const size_t min_bytes =
ggml_webgpu_flash_attn_wg_mem_bytes(decisions.q_tile, decisions.kv_tile, (uint32_t) src0->ne[0],
(uint32_t) src2->ne[0], has_mask, decisions.kv_direct);
if (min_bytes > limit_bytes) {
supports_op = false;
}
break;
}
if (!ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix) {
supports_op = false;
break;
}
const size_t min_bytes =
ggml_webgpu_flash_attn_wg_mem_bytes(decisions.q_tile, decisions.kv_tile, (uint32_t) src0->ne[0],
(uint32_t) src2->ne[0], has_mask, decisions.kv_direct);
if (min_bytes > limit_bytes) {
supports_op = false;
}
break;
}
case GGML_OP_RMS_NORM:
@@ -3894,6 +4023,10 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_OP_SSM_CONV:
supports_op = op->type == GGML_TYPE_F32;
break;
case GGML_OP_SSM_SCAN:
supports_op = op->type == GGML_TYPE_F32 &&
src0->ne[0] <= ctx->webgpu_global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
break;
case GGML_OP_GATED_DELTA_NET:
{
const uint32_t s_v = (uint32_t) src2->ne[0];
@@ -138,25 +138,54 @@ struct Params {
};
@group(0) @binding(0) var<storage, read_write> Q: array<f32>;
#ifdef KV_OVERLAP
@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
#define V K
#else
@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
@group(0) @binding(2) var<storage, read_write> V: array<KV_TYPE>;
#endif
#if defined(MASK) && defined(SINKS)
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
@group(0) @binding(4) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 5
#define PARAMS_BINDING 6
#elif defined(MASK)
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#elif defined(SINKS)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
@group(0) @binding(4) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 5
#define PARAMS_BINDING 6
#endif
#elif defined(MASK)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
#define DST_BINDING 3
#define PARAMS_BINDING 4
#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#endif
#elif defined(SINKS)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 3
#define PARAMS_BINDING 4
#else
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#endif
#else
#ifdef KV_OVERLAP
#define DST_BINDING 2
#define PARAMS_BINDING 3
#else
#define DST_BINDING 3
#define PARAMS_BINDING 4
#endif
#endif
@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<vec4<f32>>;
@@ -0,0 +1,330 @@
enable f16;
enable subgroups;
#define HEAD_DIM_QK 64
#define HEAD_DIM_V 64
#define KV_STAGE_STRIDE 64
#define Q_TILE 4
#define KV_TILE 64
#define WG_SIZE 128
struct Params {
offset_q: u32,
offset_k: u32,
offset_v: u32,
offset_mask: u32,
offset_sinks: u32,
offset_dst: u32,
n_heads: u32,
seq_len_q: u32,
seq_len_kv: u32,
stride_q1: u32,
stride_q2: u32,
stride_q3: u32,
stride_k1: u32,
stride_k2: u32,
stride_k3: u32,
stride_v1: u32,
stride_v2: u32,
stride_v3: u32,
stride_mask3: u32,
q_per_kv: u32,
scale: f32,
max_bias: f32,
logit_softcap: f32,
n_head_log2: f32,
m0: f32,
m1: f32,
};
@group(0) @binding(0) var<storage, read_write> Q: array<f32>;
#ifdef KV_OVERLAP
@group(0) @binding(1) var<storage, read_write> K: array<vec4<f16>>;
#define V K
#else
@group(0) @binding(1) var<storage, read_write> K: array<vec4<f16>>;
@group(0) @binding(2) var<storage, read_write> V: array<vec4<f16>>;
#endif
#if defined(MASK) && defined(SINKS)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
@group(0) @binding(4) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 5
#define PARAMS_BINDING 6
#endif
#elif defined(MASK)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
#define DST_BINDING 3
#define PARAMS_BINDING 4
#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#endif
#elif defined(SINKS)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 3
#define PARAMS_BINDING 4
#else
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
#define DST_BINDING 4
#define PARAMS_BINDING 5
#endif
#else
#ifdef KV_OVERLAP
#define DST_BINDING 2
#define PARAMS_BINDING 3
#else
#define DST_BINDING 3
#define PARAMS_BINDING 4
#endif
#endif
@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<vec4<f32>>;
@group(0) @binding(PARAMS_BINDING) var<uniform> params: Params;
const FLOAT_MIN: f32 = -1.0e9;
const Q_CHUNKS: u32 = HEAD_DIM_QK / 4u;
const V_CHUNKS: u32 = HEAD_DIM_V / 4u;
const SCORE_REGS_PER_LANE: u32 = (KV_TILE + MAX_SUBGROUP_SIZE - 1u) / MAX_SUBGROUP_SIZE;
const OUT_REGS_PER_LANE: u32 = (V_CHUNKS + MAX_SUBGROUP_SIZE - 1u) / MAX_SUBGROUP_SIZE;
var<workgroup> q_shmem: array<f16, Q_TILE * HEAD_DIM_QK>;
var<workgroup> kv_shmem: array<f16, KV_TILE * KV_STAGE_STRIDE>;
var<workgroup> p_shmem: array<f32, Q_TILE * KV_TILE>;
@compute @workgroup_size(WG_SIZE)
fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(subgroup_id) subgroup_id: u32,
@builtin(subgroup_size) subgroup_size: u32,
@builtin(num_subgroups) num_subgroups: u32,
@builtin(subgroup_invocation_id) sg_inv_id: u32) {
if (subgroup_size == 0u || num_subgroups < Q_TILE) {
return;
}
let wg_per_head = (params.seq_len_q + Q_TILE - 1u) / Q_TILE;
let wg_per_batch = wg_per_head * params.n_heads;
let dst2_stride = HEAD_DIM_V * params.n_heads;
let dst3_stride = dst2_stride * params.seq_len_q;
let batch_idx = wg_id.x / wg_per_batch;
let q_batch_offset = params.offset_q + batch_idx * params.stride_q3;
let k_batch_offset = params.offset_k + batch_idx * params.stride_k3;
let v_batch_offset = params.offset_v + batch_idx * params.stride_v3;
let dst_batch_offset = params.offset_dst + batch_idx * dst3_stride;
let wg_in_batch = wg_id.x % wg_per_batch;
let head_idx = wg_in_batch / wg_per_head;
let q_head_offset = q_batch_offset + head_idx * params.stride_q2;
let k_head_idx = head_idx / params.q_per_kv;
let v_head_offset = v_batch_offset + k_head_idx * params.stride_v2;
let k_head_offset = k_batch_offset + k_head_idx * params.stride_k2;
let wg_in_head = wg_in_batch % wg_per_head;
let q_row_start = wg_in_head * Q_TILE;
let global_q_row = q_row_start + subgroup_id;
let row_active = subgroup_id < Q_TILE && global_q_row < params.seq_len_q;
#ifdef MASK
let mask_global_offset = params.offset_mask + batch_idx * params.stride_mask3 + q_row_start * params.seq_len_kv;
#endif
let dst_global_offset = dst_batch_offset + q_row_start * dst2_stride + head_idx * HEAD_DIM_V;
let head = f32(head_idx);
let slope = select(1.0,
select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
pow(params.m0, head + 1.0),
head < params.n_head_log2),
params.max_bias > 0.0);
for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
let q_tile_row = elem_idx / HEAD_DIM_QK;
let q_col = elem_idx % HEAD_DIM_QK;
let head_q_row = q_row_start + q_tile_row;
let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
q_shmem[elem_idx] = f16(select(
0.0,
Q[global_q_row_offset + q_col] * params.scale,
head_q_row < params.seq_len_q));
}
workgroupBarrier();
var row_max = FLOAT_MIN;
var exp_sum = 0.0;
var out_regs: array<vec4<f32>, OUT_REGS_PER_LANE>;
for (var reg_idx = 0u; reg_idx < OUT_REGS_PER_LANE; reg_idx += 1u) {
out_regs[reg_idx] = vec4<f32>(0.0);
}
let q_base = subgroup_id * HEAD_DIM_QK;
let subgroup_p_offset = subgroup_id * KV_TILE;
for (var kv_tile = 0u; kv_tile < params.seq_len_kv; kv_tile += KV_TILE) {
let kv_count = min(KV_TILE, params.seq_len_kv - kv_tile);
let score_slots = min(SCORE_REGS_PER_LANE, (kv_count + subgroup_size - 1u) / subgroup_size);
let out_slots = min(OUT_REGS_PER_LANE, (V_CHUNKS + subgroup_size - 1u) / subgroup_size);
var local_scores: array<f32, SCORE_REGS_PER_LANE>;
for (var slot = 0u; slot < SCORE_REGS_PER_LANE; slot += 1u) {
local_scores[slot] = FLOAT_MIN;
}
for (var vec_idx_local = local_id.x; vec_idx_local < kv_count * Q_CHUNKS; vec_idx_local += WG_SIZE) {
let kv_local = vec_idx_local / Q_CHUNKS;
let chunk = vec_idx_local % Q_CHUNKS;
let global_k_row = kv_tile + kv_local;
let k_vec_index = (k_head_offset + global_k_row * params.stride_k1 + chunk * 4u) >> 2u;
let k4 = K[k_vec_index];
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
kv_shmem[kv_off + 0u] = k4.x;
kv_shmem[kv_off + 1u] = k4.y;
kv_shmem[kv_off + 2u] = k4.z;
kv_shmem[kv_off + 3u] = k4.w;
}
workgroupBarrier();
var local_max = FLOAT_MIN;
if (row_active) {
for (var slot = 0u; slot < score_slots; slot += 1u) {
let kv_local = sg_inv_id + slot * subgroup_size;
if (kv_local >= kv_count) {
continue;
}
let global_k_row = kv_tile + kv_local;
var dot_val = 0.0;
for (var chunk = 0u; chunk < Q_CHUNKS; chunk += 1u) {
let q_off = q_base + chunk * 4u;
let qv = vec4<f32>(
f32(q_shmem[q_off + 0u]),
f32(q_shmem[q_off + 1u]),
f32(q_shmem[q_off + 2u]),
f32(q_shmem[q_off + 3u]));
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
let kv = vec4<f32>(
f32(kv_shmem[kv_off + 0u]),
f32(kv_shmem[kv_off + 1u]),
f32(kv_shmem[kv_off + 2u]),
f32(kv_shmem[kv_off + 3u]));
dot_val += dot(qv, kv);
}
#ifdef LOGIT_SOFTCAP
dot_val = params.logit_softcap * tanh(dot_val);
#endif
#ifdef MASK
let mask_idx = mask_global_offset + subgroup_id * params.seq_len_kv + global_k_row;
dot_val += slope * f32(mask[mask_idx]);
#endif
local_scores[slot] = dot_val;
local_max = max(local_max, dot_val);
}
}
let tile_max = subgroupMax(local_max);
let new_max = max(row_max, tile_max);
let cur_exp = exp(row_max - new_max);
exp_sum *= cur_exp;
for (var reg_idx = 0u; reg_idx < OUT_REGS_PER_LANE; reg_idx += 1u) {
out_regs[reg_idx] *= cur_exp;
}
var local_sum = 0.0;
for (var slot = 0u; slot < score_slots; slot += 1u) {
let kv_local = sg_inv_id + slot * subgroup_size;
if (row_active && kv_local < kv_count) {
let p = exp(local_scores[slot] - new_max);
p_shmem[subgroup_p_offset + kv_local] = p;
local_sum += p;
}
}
workgroupBarrier();
for (var vec_idx_local = local_id.x; vec_idx_local < kv_count * V_CHUNKS; vec_idx_local += WG_SIZE) {
let kv_local = vec_idx_local / V_CHUNKS;
let chunk = vec_idx_local % V_CHUNKS;
let global_v_row = kv_tile + kv_local;
let v_vec_index = (v_head_offset + global_v_row * params.stride_v1 + chunk * 4u) >> 2u;
let v4 = V[v_vec_index];
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
kv_shmem[kv_off + 0u] = v4.x;
kv_shmem[kv_off + 1u] = v4.y;
kv_shmem[kv_off + 2u] = v4.z;
kv_shmem[kv_off + 3u] = v4.w;
}
workgroupBarrier();
let tile_sum = subgroupAdd(local_sum);
exp_sum += tile_sum;
row_max = new_max;
if (row_active) {
for (var reg_idx = 0u; reg_idx < out_slots; reg_idx += 1u) {
let chunk = sg_inv_id + reg_idx * subgroup_size;
if (chunk >= V_CHUNKS) {
continue;
}
var acc = out_regs[reg_idx];
for (var kv_local = 0u; kv_local < kv_count; kv_local += 1u) {
let p = p_shmem[subgroup_p_offset + kv_local];
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
let v4 = vec4<f32>(
f32(kv_shmem[kv_off + 0u]),
f32(kv_shmem[kv_off + 1u]),
f32(kv_shmem[kv_off + 2u]),
f32(kv_shmem[kv_off + 3u]));
acc += p * v4;
}
out_regs[reg_idx] = acc;
}
}
workgroupBarrier();
}
#ifdef SINKS
if (row_active) {
let sink_score = sinks[params.offset_sinks + head_idx];
let sink_max = max(row_max, sink_score);
let sink_scale = exp(row_max - sink_max);
for (var reg_idx = 0u; reg_idx < OUT_REGS_PER_LANE; reg_idx += 1u) {
out_regs[reg_idx] *= sink_scale;
}
exp_sum = exp_sum * sink_scale + exp(sink_score - sink_max);
row_max = sink_max;
}
#endif
if (row_active) {
let inv_exp_sum = select(0.0, 1.0 / exp_sum, exp_sum != 0.0);
let row_base = dst_global_offset + subgroup_id * dst2_stride;
let out_slots = min(OUT_REGS_PER_LANE, (V_CHUNKS + subgroup_size - 1u) / subgroup_size);
for (var reg_idx = 0u; reg_idx < out_slots; reg_idx += 1u) {
let chunk = sg_inv_id + reg_idx * subgroup_size;
if (chunk >= V_CHUNKS) {
continue;
}
let dst_vec_index = (row_base + chunk * 4u) >> 2u;
dst[dst_vec_index] = out_regs[reg_idx] * inv_exp_sum;
}
}
}
@@ -15,7 +15,7 @@ struct Params {
nblk1: u32,
};
@group(0) @binding(0) var<storage, read> mask: array<f16>;
@group(0) @binding(0) var<storage, read_write> mask: array<f16>;
@group(0) @binding(1) var<storage, read_write> blk: array<u32>;
@group(0) @binding(2) var<uniform> params: Params;
@@ -1,8 +1,6 @@
diagnostic(off, chromium.subgroup_matrix_uniformity);
diagnostic(off, subgroup_uniformity);
enable f16;
enable subgroups;
enable chromium_experimental_subgroup_matrix;
#ifdef KV_F32
#define KV_TYPE f32
@@ -13,19 +11,14 @@ enable chromium_experimental_subgroup_matrix;
#define HEAD_DIM_QK 64
#define HEAD_DIM_V 64
#define SG_MAT_M 8
#define SG_MAT_N 8
#define SG_MAT_K 8
#define Q_TILE SG_MAT_M
#define KV_GRANULARITY 8
#define KV_TILE 16
#define WG_SIZE 64
#ifndef VEC_NE
#define VEC_NE 4u
#endif
#define KV_BLOCKS (KV_TILE / SG_MAT_N)
#define KV_BLOCKS (KV_TILE / KV_GRANULARITY)
#define BLOCK_SIZE 32
#define BLOCKS_K ((HEAD_DIM_QK + BLOCK_SIZE - 1) / BLOCK_SIZE)
@@ -97,6 +90,14 @@ struct Params {
};
@group(0) @binding(0) var<storage, read_write> Q: array<f32>;
#ifdef KV_OVERLAP
#if defined(KV_Q4_0) || defined(KV_Q8_0)
@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
#else
@group(0) @binding(1) var<storage, read_write> K: array<vec4<KV_TYPE>>;
#endif
#define V K
#else
#if defined(KV_Q4_0) || defined(KV_Q8_0)
@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
#else
@@ -107,7 +108,22 @@ struct Params {
#else
@group(0) @binding(2) var<storage, read_write> V: array<vec4<KV_TYPE>>;
#endif
#endif
#if defined(MASK) && defined(SINKS)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
#ifdef BLK
#define BLK_BINDING 4
#define TMP_BINDING 5
#define DST_BINDING 6
#define PARAMS_BINDING 7
#else
#define TMP_BINDING 4
#define DST_BINDING 5
#define PARAMS_BINDING 6
#endif
#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
@group(0) @binding(4) var<storage, read_write> sinks: array<f32>;
#ifdef BLK
@@ -120,7 +136,21 @@ struct Params {
#define DST_BINDING 6
#define PARAMS_BINDING 7
#endif
#endif
#elif defined(MASK)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> mask: array<f16>;
#ifdef BLK
#define BLK_BINDING 3
#define TMP_BINDING 4
#define DST_BINDING 5
#define PARAMS_BINDING 6
#else
#define TMP_BINDING 3
#define DST_BINDING 4
#define PARAMS_BINDING 5
#endif
#else
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
#ifdef BLK
#define BLK_BINDING 4
@@ -132,16 +162,30 @@ struct Params {
#define DST_BINDING 5
#define PARAMS_BINDING 6
#endif
#endif
#elif defined(SINKS)
#ifdef KV_OVERLAP
@group(0) @binding(2) var<storage, read_write> sinks: array<f32>;
#define TMP_BINDING 3
#define DST_BINDING 4
#define PARAMS_BINDING 5
#else
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
#define TMP_BINDING 4
#define DST_BINDING 5
#define PARAMS_BINDING 6
#endif
#else
#ifdef KV_OVERLAP
#define TMP_BINDING 2
#define DST_BINDING 3
#define PARAMS_BINDING 4
#else
#define TMP_BINDING 3
#define DST_BINDING 4
#define PARAMS_BINDING 5
#endif
#endif
#ifdef BLK
@group(0) @binding(BLK_BINDING) var<storage, read_write> blk: array<u32>;
@@ -153,7 +197,7 @@ struct Params {
// Just a very small float value.
const FLOAT_MIN: f32 = -1.0e9;
var<workgroup> q_shmem: array<f16, Q_TILE * HEAD_DIM_QK>;
var<workgroup> q_shmem: array<f16, HEAD_DIM_QK>;
#ifndef KV_DIRECT
const kv_shmem_size = KV_TILE * max(HEAD_DIM_QK, HEAD_DIM_V);
@@ -161,31 +205,27 @@ const kv_shmem_size = KV_TILE * max(HEAD_DIM_QK, HEAD_DIM_V);
var<workgroup> kv_shmem: array<f16, kv_shmem_size>;
#endif
var<workgroup> o_shmem: array<f16, Q_TILE * HEAD_DIM_V>;
var<workgroup> o_shmem: array<f16, HEAD_DIM_V>;
#ifdef MASK
// storage for mask values
var<workgroup> mask_shmem: array<f16, Q_TILE * KV_TILE>;
var<workgroup> mask_shmem: array<f16, KV_TILE>;
#endif
// note that we reuse the same storage for both since we only need one at a time
var<workgroup> inter_shmem: array<f16, Q_TILE * KV_TILE>;
var<workgroup> inter_shmem: array<f16, KV_TILE>;
// Storage for row max and exp sum during online softmax
var<workgroup> row_max_shmem: array<f32, Q_TILE>;
var<workgroup> exp_sum_shmem: array<f32, Q_TILE>;
var<workgroup> blk_state_wg: u32;
fn calc_softmax_term(kv_idx: u32, q_tile_row: u32, slope: f32, has_bias: bool, apply_mask: bool) -> f32 {
fn calc_softmax_term(kv_idx: u32, slope: f32, has_bias: bool, apply_mask: bool) -> f32 {
var v = select(FLOAT_MIN,
f32(inter_shmem[kv_idx + q_tile_row * KV_TILE]) * params.scale,
f32(inter_shmem[kv_idx]) * params.scale,
kv_idx < KV_TILE);
#ifdef LOGIT_SOFTCAP
v = params.logit_softcap * tanh(v);
#endif
#ifdef MASK
if (apply_mask) {
var mask_val = select(0.0,f32(mask_shmem[q_tile_row * KV_TILE + kv_idx]), kv_idx < KV_TILE);
var mask_val = select(0.0, f32(mask_shmem[kv_idx]), kv_idx < KV_TILE);
v += select(mask_val, slope * mask_val, has_bias);
}
#endif
@@ -199,19 +239,17 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
@builtin(subgroup_size) subgroup_size: u32,
@builtin(num_subgroups) num_subgroups: u32,
@builtin(subgroup_invocation_id) sg_inv_id: u32) {
// Vec path processes exactly one query row per workgroup, so subgroup 0 can
// keep the running softmax state in private storage.
var row_max = FLOAT_MIN;
var exp_sum = 0.0;
// initialize row max for online softmax
for (var i = local_id.x; i < Q_TILE; i += WG_SIZE) {
row_max_shmem[i] = FLOAT_MIN;
exp_sum_shmem[i] = 0.0;
}
for (var i = local_id.x; i < Q_TILE * HEAD_DIM_V; i += WG_SIZE) {
for (var i = local_id.x; i < HEAD_DIM_V; i += WG_SIZE) {
o_shmem[i] = 0.0;
}
// workgroups per head/batch
let wg_per_head = (params.seq_len_q + Q_TILE - 1u) / Q_TILE;
let wg_per_head = params.seq_len_q;
let wg_per_batch = wg_per_head * params.n_heads;
let dst2_stride = HEAD_DIM_V * params.n_heads;
@@ -235,9 +273,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let k_head_offset = k_batch_offset + k_head_idx * params.stride_k2;
let v_head_offset = v_batch_offset + v_head_idx * params.stride_v2;
// starting Q row for this workgroup
// Vec path handles one Q row per workgroup.
let wg_in_head = wg_in_batch % wg_per_head;
let q_row_start = wg_in_head * Q_TILE;
let q_row_start = wg_in_head;
#ifdef MASK
// mask offset
@@ -248,21 +286,18 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let has_bias = params.max_bias > 0.0;
let slope = select(1.0, select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0), pow(params.m0, head + 1.0), head < params.n_head_log2), has_bias);
// load q tile into shared memory
for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
let q_row = elem_idx / HEAD_DIM_QK;
let q_col = elem_idx % HEAD_DIM_QK;
let head_q_row = q_row_start + q_row;
let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
// load the single Q row into shared memory
for (var elem_idx = local_id.x; elem_idx < HEAD_DIM_QK; elem_idx += WG_SIZE) {
let global_q_row_offset = q_head_offset + q_row_start * params.stride_q1;
q_shmem[elem_idx] = f16(select(
0.0,
Q[global_q_row_offset + q_col],
head_q_row < params.seq_len_q && q_col < HEAD_DIM_QK));
Q[global_q_row_offset + elem_idx],
q_row_start < params.seq_len_q));
}
for (var kv_tile = iwg * KV_TILE; kv_tile < params.seq_len_kv; kv_tile += KV_TILE * params.nwg) {
#ifdef BLK
let q_blk = q_row_start / Q_TILE;
let q_blk = q_row_start;
let kv_blk = kv_tile / KV_TILE;
let blk_batch = select(0u, batch_idx, params.stride_mask3 > 0u);
let blk_idx = params.blk_base + (blk_batch * params.blk_nblk1 + q_blk) * params.blk_nblk0 + kv_blk;
@@ -270,13 +305,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
#else
let blk_state_local = 1u;
#endif
if (local_id.x == 0u) {
blk_state_wg = blk_state_local;
}
workgroupBarrier();
let blk_state = blk_state_wg;
let blk_state = blk_state_local;
let skip_tile = blk_state == 0u;
for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
for (var elem_idx = local_id.x; elem_idx < KV_TILE; elem_idx += WG_SIZE) {
inter_shmem[elem_idx] = f16(0.0);
}
@@ -360,20 +391,14 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let num_of_threads = subgroup_size / VEC_NE;
let tx = sg_inv_id % num_of_threads;
let ty = sg_inv_id / num_of_threads;
for (var q_tile_row = subgroup_id; q_tile_row < Q_TILE; q_tile_row += num_subgroups) {
let global_q_row = q_row_start + q_tile_row;
if (global_q_row >= params.seq_len_q) {
continue;
}
let local_q_row_offset = q_tile_row * HEAD_DIM_QK;
if (subgroup_id == 0u && q_row_start < params.seq_len_q) {
for (var kv_base : u32 = 0u; kv_base < KV_TILE; kv_base += VEC_NE) {
let kv_idx = kv_base + ty;
var partial_sum: f32 = 0.0;
let kv_valid = kv_idx < KV_TILE && (kv_tile + kv_idx) < params.seq_len_kv;
if (kv_valid) {
for (var i = tx; i < (HEAD_DIM_QK / 4u); i += num_of_threads) {
let q_off = local_q_row_offset + i * 4u;
let q_off = i * 4u;
let qv = vec4<f32>(
f32(q_shmem[q_off + 0u]),
@@ -410,8 +435,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let sum_bcast = subgroupShuffle(sum, num_of_threads * ty);
if (tx == 0u && kv_valid) {
let dst_idx = q_tile_row * KV_TILE + kv_idx;
inter_shmem[dst_idx] = f16(sum_bcast);
inter_shmem[kv_idx] = f16(sum_bcast);
}
}
}
@@ -422,13 +446,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
let apply_mask = !skip_tile && (blk_state != 2u);
if (apply_mask) {
// load mask tile into shared memory for this KV block
for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
let mask_row = elem_idx / KV_TILE;
let mask_col = elem_idx % KV_TILE;
let global_q_row = q_row_start + mask_row;
let global_k_col = kv_tile + mask_col;
let mask_in_bounds = global_q_row < params.seq_len_q && global_k_col < params.seq_len_kv;
let mask_idx = mask_global_offset + mask_row * params.seq_len_kv + global_k_col;
for (var elem_idx = local_id.x; elem_idx < KV_TILE; elem_idx += WG_SIZE) {
let global_k_col = kv_tile + elem_idx;
let mask_in_bounds = q_row_start < params.seq_len_q && global_k_col < params.seq_len_kv;
let mask_idx = mask_global_offset + global_k_col;
mask_shmem[elem_idx] = select(0.0, mask[mask_idx], mask_in_bounds);
}
}
@@ -439,50 +460,40 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
workgroupBarrier();
// online softmax
if (!skip_tile) {
for (var q_tile_row = subgroup_id; q_tile_row < Q_TILE; q_tile_row += num_subgroups) {
let global_q_row = q_row_start + q_tile_row;
if (global_q_row >= params.seq_len_q) {
break;
}
if (!skip_tile && subgroup_id == 0u && q_row_start < params.seq_len_q) {
var prev_max = row_max;
var final_max = prev_max;
// pass 1: compute final max across the full KV tile in chunks
for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
let kv_idx = kv_offset + sg_inv_id;
let kv_valid = kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE;
let softmax_term = select(FLOAT_MIN,
calc_softmax_term(kv_idx, slope, has_bias, apply_mask),
kv_valid);
final_max = subgroupMax(max(final_max, softmax_term));
}
var prev_max = row_max_shmem[q_tile_row];
var final_max = prev_max;
// pass 1: compute final max across the full KV tile in chunks
for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
let kv_idx = kv_offset + sg_inv_id;
let kv_valid = kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE;
let softmax_term = select(FLOAT_MIN,
calc_softmax_term(kv_idx, q_tile_row, slope, has_bias, apply_mask),
kv_valid);
final_max = subgroupMax(max(final_max, softmax_term));
var total_exp_term: f32 = 0.0;
// pass 2: compute exp sum and write P using final_max
for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
let kv_idx = kv_offset + sg_inv_id;
let softmax_term = calc_softmax_term(kv_idx, slope, has_bias, apply_mask);
let cur_p = select(0.0,
exp(softmax_term - final_max),
kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE);
total_exp_term += subgroupAdd(cur_p);
if (kv_idx < KV_TILE) {
inter_shmem[kv_idx] = f16(cur_p);
}
}
var total_exp_term: f32 = 0.0;
// pass 2: compute exp sum and write P using final_max
for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
let kv_idx = kv_offset + sg_inv_id;
let softmax_term = calc_softmax_term(kv_idx, q_tile_row, slope, has_bias, apply_mask);
let cur_p = select(0.0,
exp(softmax_term - final_max),
kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE);
total_exp_term += subgroupAdd(cur_p);
if (kv_idx < KV_TILE) {
inter_shmem[kv_idx + q_tile_row * KV_TILE] = f16(cur_p);
}
}
let cur_exp = exp(prev_max - final_max);
let cur_exp = exp(prev_max - final_max);
row_max = final_max;
exp_sum = exp_sum * cur_exp + total_exp_term;
if (sg_inv_id == 0) {
row_max_shmem[q_tile_row] = final_max;
exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * cur_exp + total_exp_term;
}
for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
let idx = q_tile_row * HEAD_DIM_V + elem_idx;
o_shmem[idx] = f16(f32(o_shmem[idx]) * cur_exp);
}
for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
o_shmem[elem_idx] = f16(f32(o_shmem[elem_idx]) * cur_exp);
}
}
@@ -562,15 +573,13 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
workgroupBarrier();
if (!skip_tile) {
// we have P (Q_TILE x KV_TILE) in inter_shmem and V (KV_TILE x head_dim_v) in kv_shmem
// we have P (KV_TILE) in inter_shmem and V (KV_TILE x head_dim_v) in kv_shmem
// we want to compute O += P * V across the full KV tile
let ne_threads : u32 = VEC_NE;
let nl_threads = max(1u, subgroup_size / ne_threads);
let tx_pv = sg_inv_id % nl_threads;
let ty_pv = sg_inv_id / nl_threads;
for (var q_tile_row = subgroup_id;
q_tile_row < Q_TILE;
q_tile_row += num_subgroups) {
if (subgroup_id == 0u && q_row_start < params.seq_len_q) {
for (var vec_col = tx_pv; vec_col < (HEAD_DIM_V / 4u); vec_col += nl_threads) {
var lo = vec4<f32>(0.0, 0.0, 0.0, 0.0);
for (var cc = 0u; cc < KV_TILE / ne_threads; cc += 1u) {
@@ -580,7 +589,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
continue;
}
let p = f32(inter_shmem[kv_idx + q_tile_row * KV_TILE]);
let p = f32(inter_shmem[kv_idx]);
#ifdef KV_DIRECT
let v_idx = v_head_offset + v_row * params.stride_v1 + vec_col * 4u;
let v4 = vec4<f32>(V[v_idx >> 2u]);
@@ -621,11 +630,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
if (ty_pv == 0u) {
let elem_base = vec_col * 4u;
let o_base_idx = q_tile_row * HEAD_DIM_V + elem_base;
o_shmem[o_base_idx + 0u] = f16(f32(o_shmem[o_base_idx + 0u]) + lo_x);
o_shmem[o_base_idx + 1u] = f16(f32(o_shmem[o_base_idx + 1u]) + lo_y);
o_shmem[o_base_idx + 2u] = f16(f32(o_shmem[o_base_idx + 2u]) + lo_z);
o_shmem[o_base_idx + 3u] = f16(f32(o_shmem[o_base_idx + 3u]) + lo_w);
o_shmem[elem_base + 0u] = f16(f32(o_shmem[elem_base + 0u]) + lo_x);
o_shmem[elem_base + 1u] = f16(f32(o_shmem[elem_base + 1u]) + lo_y);
o_shmem[elem_base + 2u] = f16(f32(o_shmem[elem_base + 2u]) + lo_z);
o_shmem[elem_base + 3u] = f16(f32(o_shmem[elem_base + 3u]) + lo_w);
}
}
}
@@ -637,70 +645,46 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
#ifdef SINKS
// Sinks are global terms and must be applied exactly once across split workgroups.
if (iwg == 0u) {
for (var q_tile_row = subgroup_id;
q_tile_row < Q_TILE;
q_tile_row += num_subgroups) {
let global_q_row = q_row_start + q_tile_row;
if (global_q_row >= params.seq_len_q) {
break;
}
if (iwg == 0u && subgroup_id == 0u && q_row_start < params.seq_len_q) {
var prev_max = row_max;
var prev_max = row_max_shmem[q_tile_row];
// for non-sink threads, exp(FLOAT_MIN) effectively zeroes out their contribution to the sum
let sink_val = select(FLOAT_MIN, sinks[params.offset_sinks + head_idx], sg_inv_id == 0u);
let new_max = subgroupMax(max(prev_max, sink_val));
let max_exp = exp(prev_max - new_max);
let sink_exp = exp(sink_val - new_max);
// for non-sink threads, exp(FLOAT_MIN) effectively zeroes out their contribution to the sum
let sink_val = select(FLOAT_MIN, sinks[params.offset_sinks + head_idx], sg_inv_id == 0);
let new_max = subgroupMax(max(prev_max, sink_val));
let max_exp = exp(prev_max - new_max);
let sink_exp = exp(sink_val - new_max);
let sink_exp_sum = subgroupAdd(sink_exp);
let sink_exp_sum = subgroupAdd(sink_exp);
row_max = new_max;
exp_sum = exp_sum * max_exp + sink_exp_sum;
if (sg_inv_id == 0) {
row_max_shmem[q_tile_row] = new_max;
exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * max_exp + sink_exp_sum;
}
for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
let idx = q_tile_row * HEAD_DIM_V + elem_idx;
o_shmem[idx] = f16(f32(o_shmem[idx]) * max_exp);
}
for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
o_shmem[elem_idx] = f16(f32(o_shmem[elem_idx]) * max_exp);
}
workgroupBarrier();
}
workgroupBarrier();
#endif
let rows_per_batch = params.n_heads * params.seq_len_q;
for (var q_tile_row = subgroup_id;
q_tile_row < Q_TILE;
q_tile_row += num_subgroups) {
let global_q_row = q_row_start + q_tile_row;
if (global_q_row >= params.seq_len_q) { break; }
if (subgroup_id == 0u && q_row_start < params.seq_len_q) {
if (params.nwg == 1u) {
let exp_sum = exp_sum_shmem[q_tile_row];
let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0.0);
let row_base: u32 =
params.offset_dst + batch_idx * dst3_stride + global_q_row * dst2_stride + head_idx * HEAD_DIM_V;
let row_base: u32 = params.offset_dst + batch_idx * dst3_stride + q_row_start * dst2_stride +
head_idx * HEAD_DIM_V;
for (var elem_base = sg_inv_id * 4u; elem_base < HEAD_DIM_V; elem_base += subgroup_size * 4u) {
let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u);
let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u);
let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u);
let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u);
let v = vec4<f32>(
f32(o_shmem[i0]) * scale,
f32(o_shmem[i1]) * scale,
f32(o_shmem[i2]) * scale,
f32(o_shmem[i3]) * scale
f32(o_shmem[elem_base + 0u]) * scale,
f32(o_shmem[elem_base + 1u]) * scale,
f32(o_shmem[elem_base + 2u]) * scale,
f32(o_shmem[elem_base + 3u]) * scale
);
let dst_vec_index: u32 = (row_base + elem_base) >> 2u;
dst[dst_vec_index] = v;
}
} else {
let rid = batch_idx * rows_per_batch + head_idx * params.seq_len_q + global_q_row;
let rid = batch_idx * rows_per_batch + head_idx * params.seq_len_q + q_row_start;
let tmp_row_data_base = params.tmp_data_base + rid * (HEAD_DIM_V * params.nwg) + iwg * HEAD_DIM_V;
let tmp_row_stats_base = params.tmp_stats_base + rid * (2u * params.nwg) + 2u * iwg;
@@ -708,21 +692,16 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
elem_base < HEAD_DIM_V;
elem_base += subgroup_size * 4u) {
let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u);
let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u);
let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u);
let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u);
let tbase = tmp_row_data_base + elem_base;
tmp[tbase + 0u] = f32(o_shmem[i0]);
tmp[tbase + 1u] = f32(o_shmem[i1]);
tmp[tbase + 2u] = f32(o_shmem[i2]);
tmp[tbase + 3u] = f32(o_shmem[i3]);
tmp[tbase + 0u] = f32(o_shmem[elem_base + 0u]);
tmp[tbase + 1u] = f32(o_shmem[elem_base + 1u]);
tmp[tbase + 2u] = f32(o_shmem[elem_base + 2u]);
tmp[tbase + 3u] = f32(o_shmem[elem_base + 3u]);
}
if (sg_inv_id == 0u) {
tmp[tmp_row_stats_base + 0u] = exp_sum_shmem[q_tile_row];
tmp[tmp_row_stats_base + 1u] = row_max_shmem[q_tile_row];
tmp[tmp_row_stats_base + 0u] = exp_sum;
tmp[tmp_row_stats_base + 1u] = row_max;
}
}
}
@@ -1,4 +1,4 @@
#ifdef INPLACE
#ifdef OVERLAP
@group(0) @binding(0)
var<storage, read_write> rn_src: array<f32>;
@@ -13,6 +13,21 @@ fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32)
mul_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
}
#elif INPLACE
@group(0) @binding(0)
var<storage, read_write> rn_src: array<f32>;
@group(0) @binding(1)
var<storage, read_write> mul_src: array<f32>;
@group(0) @binding(2)
var<uniform> params: Params;
fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
rn_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
}
#elif SRC_OVERLAP
@group(0) @binding(0)
@@ -0,0 +1,168 @@
#ifdef USE_SUBGROUP_REDUCTION
enable subgroups;
#endif
struct Params {
offset_s: u32,
offset_x: u32,
offset_dt: u32,
offset_A: u32,
offset_B: u32,
offset_C: u32,
offset_ids: u32,
offset_dst: u32,
stride_s1: u32,
stride_s2: u32,
stride_s3: u32,
stride_x1: u32,
stride_x2: u32,
stride_x3: u32,
stride_dt1: u32,
stride_dt2: u32,
a_ne0: u32,
stride_A1: u32,
stride_B1: u32,
stride_B2: u32,
stride_B3: u32,
stride_C1: u32,
stride_C2: u32,
stride_C3: u32,
d_state: u32,
d_inner: u32,
n_head: u32,
n_group: u32,
n_seq_tokens: u32,
n_seqs: u32,
y_elems: u32,
};
@group(0) @binding(0) var<storage, read_write> s_in: array<f32>;
@group(0) @binding(1) var<storage, read_write> x: array<f32>;
@group(0) @binding(2) var<storage, read_write> dt: array<f32>;
@group(0) @binding(3) var<storage, read_write> A: array<f32>;
@group(0) @binding(4) var<storage, read_write> B: array<f32>;
@group(0) @binding(5) var<storage, read_write> C: array<f32>;
@group(0) @binding(6) var<storage, read_write> ids: array<i32>;
@group(0) @binding(7) var<storage, read_write> dst: array<f32>;
@group(0) @binding(8) var<uniform> params: Params;
var<workgroup> shared_x_dt: array<f32, TOKENS_PER_TILE>;
var<workgroup> shared_dtsp: array<f32, TOKENS_PER_TILE>;
var<workgroup> shared_reduce: array<f32, TOKENS_PER_TILE * WG_SIZE>;
fn reduce_base(token_in_tile: u32) -> u32 {
return token_in_tile * WG_SIZE;
}
@compute @workgroup_size(WG_SIZE)
fn main(
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(workgroup_id) wg_id: vec3<u32>,
@builtin(num_workgroups) num_wg: vec3<u32>
#ifdef USE_SUBGROUP_REDUCTION
, @builtin(subgroup_id) subgroup_id: u32,
@builtin(subgroup_invocation_id) subgroup_invocation_id: u32,
@builtin(num_subgroups) num_subgroups: u32
#endif
) {
let tid = local_id.x;
let wg_linear = wg_id.y * num_wg.x + wg_id.x;
let i1 = wg_linear % params.d_inner;
let head_seq = wg_linear / params.d_inner;
let ir = head_seq % params.n_head;
let i3 = head_seq / params.n_head;
let state_slot = u32(ids[params.offset_ids + i3]);
let g = ir / (params.n_head / params.n_group);
let s_idx = params.offset_s + tid + i1 * params.stride_s1 + ir * params.stride_s2 + state_slot * params.stride_s3;
var s_prev = s_in[s_idx];
let A0 = A[params.offset_A + (tid % params.a_ne0) + ir * params.stride_A1];
for (var token_base = 0u; token_base < params.n_seq_tokens; token_base += TOKENS_PER_TILE) {
if (tid < TOKENS_PER_TILE) {
let token = token_base + tid;
if (token < params.n_seq_tokens) {
let x_idx = params.offset_x + i1 + ir * params.stride_x1 + token * params.stride_x2 + i3 * params.stride_x3;
let dt_idx = params.offset_dt + ir + token * params.stride_dt1 + i3 * params.stride_dt2;
let dt0 = dt[dt_idx];
let dtsp = select(log(1.0 + exp(dt0)), dt0, dt0 > 20.0);
shared_dtsp[tid] = dtsp;
shared_x_dt[tid] = x[x_idx] * dtsp;
}
}
workgroupBarrier();
for (var token_in_tile = 0u; token_in_tile < TOKENS_PER_TILE; token_in_tile++) {
let token = token_base + token_in_tile;
if (token >= params.n_seq_tokens) {
break;
}
let x_dt = shared_x_dt[token_in_tile];
let dA = exp(shared_dtsp[token_in_tile] * A0);
let reduce_idx = reduce_base(token_in_tile) + tid;
let b_idx = params.offset_B + tid + g * params.stride_B1 + token * params.stride_B2 + i3 * params.stride_B3;
let c_idx = params.offset_C + tid + g * params.stride_C1 + token * params.stride_C2 + i3 * params.stride_C3;
let s = s_prev * dA + B[b_idx] * x_dt;
s_prev = s;
#ifdef USE_SUBGROUP_REDUCTION
let subgroup_partial = subgroupAdd(s * C[c_idx]);
if (subgroup_invocation_id == 0u) {
shared_reduce[reduce_idx - tid + subgroup_id] = subgroup_partial;
}
#else
shared_reduce[reduce_idx] = s * C[c_idx];
#endif
workgroupBarrier();
#ifdef USE_SUBGROUP_REDUCTION
if (tid == 0u) {
var sum = 0.0;
for (var sg = 0u; sg < num_subgroups; sg++) {
sum += shared_reduce[reduce_base(token_in_tile) + sg];
}
let y_idx =
params.offset_dst + i1 + ir * params.d_inner + token * (params.n_head * params.d_inner) +
i3 * (params.n_seq_tokens * params.n_head * params.d_inner);
dst[y_idx] = sum;
}
#else
for (var stride = WG_SIZE / 2u; stride > 0u; stride >>= 1u) {
if (tid < stride) {
shared_reduce[reduce_idx] += shared_reduce[reduce_idx + stride];
}
workgroupBarrier();
}
if (tid == 0u) {
let y_idx =
params.offset_dst + i1 + ir * params.d_inner + token * (params.n_head * params.d_inner) +
i3 * (params.n_seq_tokens * params.n_head * params.d_inner);
dst[y_idx] = shared_reduce[reduce_base(token_in_tile)];
}
#endif
workgroupBarrier();
}
}
let state_idx =
params.offset_dst + params.y_elems + tid + i1 * params.d_state + ir * (params.d_state * params.d_inner) +
i3 * (params.d_state * params.d_inner * params.n_head);
dst[state_idx] = s_prev;
}
+20 -20
View File
@@ -7656,7 +7656,7 @@ size_t ggml_quantize_chunk(
int64_t nrows,
int64_t n_per_row,
const float * imatrix) {
const int64_t n = (int64_t) nrows * n_per_row;
const int64_t n = nrows * n_per_row;
if (ggml_quantize_requires_imatrix(type)) {
GGML_ASSERT(imatrix != NULL);
@@ -7673,21 +7673,21 @@ size_t ggml_quantize_chunk(
size_t result = 0;
switch (type) {
case GGML_TYPE_Q1_0: result = quantize_q1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_MXFP4: result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_NVFP4: result = quantize_nvfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q1_0: result = quantize_q1_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0: result = quantize_q4_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_1: result = quantize_q4_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_0: result = quantize_q5_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_1: result = quantize_q5_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_0: result = quantize_q8_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_MXFP4: result = quantize_mxfp4 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_NVFP4: result = quantize_nvfp4 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q2_K: result = quantize_q2_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_K: result = quantize_q3_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_K: result = quantize_q4_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_K: result = quantize_q5_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q6_K: result = quantize_q6_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_TQ1_0: result = quantize_tq1_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_TQ2_0: result = quantize_tq2_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
@@ -7752,9 +7752,9 @@ struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
}
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}
+8
View File
@@ -68,11 +68,19 @@ dir=$(basename $(pwd))
git branch -D pr/$PR 2> /dev/null
git worktree add -b pr/$PR ../$dir-pr-$PR pr/$PR/$head_ref 2> /dev/null
og_path=$(pwd)
wt_path=$(cd ../$dir-pr-$PR && pwd)
echo "git worktree created in $wt_path"
cd $wt_path
# pi agent setup in the worktree
if [[ -f "$og_path/.pi/SYSTEM.md" && ! -f ".pi/SYSTEM.md" ]]; then
mkdir -p .pi
ln -sfn "$og_path/.pi/SYSTEM.md" .pi/SYSTEM.md
fi
git branch --set-upstream-to=pr/$PR/$head_ref
git pull --ff-only || {
echo "error: failed to pull pr/$PR"
+69 -9
View File
@@ -3,8 +3,12 @@
Test structured output capability via chat completions endpoint.
Each test case contains:
- response_format: OpenAI-compatible response_format specification
(json_schema only llama.cpp does not support json_object)
- response_format: OpenAI-compatible response_format specification.
Both "json_schema" and "json_object" are accepted; with
"json_object" a schema can be supplied via extra_body.
- extra_body (optional): dict of extra top-level request fields merged into
the request payload (mirrors the OpenAI SDK's extra_body
feature; llama.cpp reads a top-level "json_schema" here).
- messages: initial conversation messages
- tools (optional): tool definitions (for mixed tool + structured tests)
- mock_tool_responses (optional): dict mapping tool_name -> callable(arguments) -> str (JSON)
@@ -81,11 +85,14 @@ def print_info(msg):
_print(f"{DIM}{msg}{RESET}")
def print_schema_note(label, rf):
def print_schema_note(label, rf, extra_body=None):
kind = rf.get("type", "?")
name = ""
if kind == "json_schema":
name = rf.get("json_schema", {}).get("name", "")
elif kind == "json_object" and extra_body and "json_schema" in extra_body:
extra_schema = extra_body["json_schema"] or {}
name = extra_schema.get("title") or "extra_body.json_schema"
_print(f"{DIM}{MAGENTA} ⟐ response_format [{label}]: {kind}"
f"{(' / ' + name) if name else ''}{RESET}")
@@ -95,17 +102,20 @@ def print_schema_note(label, rf):
# ---------------------------------------------------------------------------
def chat_completion(url, messages, tools=None, response_format=None, stream=False):
def chat_completion(url, messages, tools=None, response_format=None, stream=False,
extra_body=None):
payload = {
"messages": messages,
"stream": stream,
"max_tokens": 4096,
"max_tokens": 8192,
}
if tools:
payload["tools"] = tools
payload["tool_choice"] = "auto"
if response_format is not None:
payload["response_format"] = response_format
if extra_body:
payload.update(extra_body)
try:
response = requests.post(url, json=payload, stream=stream)
@@ -180,7 +190,7 @@ def chat_completion(url, messages, tools=None, response_format=None, stream=Fals
def run_tool_loop(
url, messages, tools, mock_tool_responses, stream, response_format=None,
max_turns=6,
extra_body=None, max_turns=6,
):
"""
Drive the tool-call loop. If response_format is provided it is applied to
@@ -191,7 +201,8 @@ def run_tool_loop(
for _ in range(max_turns):
result = chat_completion(
url, msgs, tools=tools, response_format=response_format, stream=stream
url, msgs, tools=tools, response_format=response_format, stream=stream,
extra_body=extra_body,
)
if result is None:
return all_tool_calls, msgs, None
@@ -274,7 +285,8 @@ def run_test(url, test_case, stream):
print_header(f"{name} [{mode}] ({apply_stage})")
response_format = test_case["response_format"]
print_schema_note(apply_stage, response_format)
extra_body = test_case.get("extra_body")
print_schema_note(apply_stage, response_format, extra_body)
tools = test_case.get("tools")
mocks = test_case.get("mock_tool_responses") or {}
@@ -290,6 +302,7 @@ def run_test(url, test_case, stream):
mock_tool_responses=mocks,
stream=stream,
response_format=response_format,
extra_body=extra_body,
)
elif apply_stage == "after_tools":
# Phase 1: plain tool loop, no response_format applied yet.
@@ -314,7 +327,8 @@ def run_test(url, test_case, stream):
# model focuses on producing the schema-constrained answer.
_print(f"\n{DIM}{MAGENTA} ⟐ follow-up turn with response_format applied{RESET}")
result = chat_completion(
url, msgs, tools=None, response_format=response_format, stream=stream
url, msgs, tools=None, response_format=response_format, stream=stream,
extra_body=extra_body,
)
final_content = result["content"] if result else None
else:
@@ -481,6 +495,51 @@ def _validate_sentiment(parsed):
return True, f"sentiment={parsed['sentiment']} conf={conf} kws={kws}"
# ---- Test: json_object + extra_body.json_schema (always) ----
#
# Exercises the llama.cpp-specific path where the OpenAI SDK would send
# response_format={"type": "json_object"} and tunnel the schema through
# extra_body.json_schema (which becomes a top-level "json_schema" field on
# the request body).
_PRODUCT_JSON_OBJECT_SCHEMA = {
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://example.com/product.schema.json",
"title": "Product",
"description": "A product in the catalog",
"type": "object",
}
PRODUCT_JSON_OBJECT_TEST_CASE = {
"name": "json_object response_format with extra_body json_schema",
"response_format": {"type": "json_object"},
"extra_body": {"json_schema": _PRODUCT_JSON_OBJECT_SCHEMA},
"apply_stage": "always",
"messages": [
{
"role": "system",
"content": (
"Extract structured data from the provided text according to the "
"JSON schema. Return only valid JSON matching the schema exactly."
),
},
{
"role": "user",
"content": "Product: Wireless Headphones, ID: 101, In Stock: Yes",
},
],
"validate": lambda parsed, tcs, raw: _validate_product_json_object(parsed),
}
def _validate_product_json_object(parsed):
if not isinstance(parsed, dict):
return False, f"expected JSON object, got {type(parsed).__name__}: {parsed!r}"
if not parsed:
return False, f"expected non-empty object, got {parsed!r}"
return True, f"product object with {len(parsed)} field(s): {sorted(parsed.keys())}"
# ---- Test 3: Nested recipe schema (always) ----
_RECIPE_SCHEMA = {
@@ -915,6 +974,7 @@ def _validate_country_report(parsed, tcs):
ALL_TEST_CASES = [
BOOK_TEST_CASE,
SENTIMENT_TEST_CASE,
PRODUCT_JSON_OBJECT_TEST_CASE,
RECIPE_TEST_CASE,
SHOP_COMPARISON_TEST_CASE,
COUNTRY_REPORT_TEST_CASE,
+2 -2
View File
@@ -23,10 +23,10 @@ verbose=
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
+2 -2
View File
@@ -28,10 +28,10 @@ sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
+2 -2
View File
@@ -28,10 +28,10 @@ sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
+2 -2
View File
@@ -37,10 +37,10 @@ sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
+2 -2
View File
@@ -25,10 +25,10 @@ sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
+188
View File
@@ -0,0 +1,188 @@
#!/usr/bin/env python3
import sys
import os
import re
import argparse
import statistics
import logging
from collections import defaultdict
# Mapping of cli-friendly names to (internal_data_key, Display Header, numeric_sort_key)
COL_MAP = {
"op": ("op", "Op", "op"),
"dims": ("dims", "Dims", "dims"),
"dtypes": ("dtypes", "DTypes", "dtypes"),
"count": ("count", "Count", "_sort_count"),
"max-usec": ("max_usec", "Max usec", "_sort_max_usec"),
"avg-usec": ("avg_usec", "Avg usec", "_sort_avg_usec"),
"max-cycles": ("max_cycles", "Max Cycles", "_sort_max_cycles"),
"avg-cycles": ("avg_cycles", "Avg Cycles", "_sort_avg_cycles"),
"max-pmu": ("max_pmu", "Max PMU", "_sort_max_pmu"),
"avg-pmu": ("avg_pmu", "Avg PMU", "_sort_avg_pmu"),
}
op_pattern = re.compile(
r"profile-op\s+(?P<op_name>[A-Z_0-9]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
)
logger = logging.getLogger("ggml-hexagon-profile")
def parse_log(file_path, pmu_index=None):
try:
if file_path != "-":
f = open(file_path, 'r', encoding='utf-8', errors='ignore')
else:
f = os.fdopen(0, 'r', encoding='utf-8', errors='ignore')
except FileNotFoundError:
logger.error(f"file '{file_path}' not found.")
sys.exit(1)
all_ops = []
for line in f:
match = op_pattern.search(line)
if not match: continue
pmu_raw = match.group('pmu')
pmu_val = None
if pmu_raw and pmu_index is not None:
try:
pmu_list = [int(x.strip()) for x in pmu_raw.split(',')]
if len(pmu_list) > pmu_index:
pmu_val = pmu_list[pmu_index]
except (ValueError, IndexError):
pmu_val = None
all_ops.append({
'name': match.group('op_name'),
'dims': match.group('dims').strip(),
'types': match.group('types').strip(),
'usec': int(match.group('usec')),
'cycles': int(match.group('cycles')),
'pmu_val': pmu_val
})
f.close()
return all_ops
def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None):
if not ops:
logger.info("No valid records found.")
return
grouped = defaultdict(list)
for op in ops:
key = (op['name'], op['dims'], op['types'])
grouped[key].append(op)
group_stats = []
for (name, dims, types), group_ops in grouped.items():
usecs = [o['usec'] for o in group_ops]
cycles = [o['cycles'] for o in group_ops]
pmu_vals = [o['pmu_val'] for o in group_ops if o['pmu_val'] is not None]
group_stats.append({
'op': name,
'dims': dims,
'dtypes': types,
'count': str(len(group_ops)),
'max_usec': str(max(usecs)),
'avg_usec': f"{statistics.mean(usecs):.2f}",
'max_cycles': str(max(cycles)),
'avg_cycles': f"{statistics.mean(cycles):.2f}",
'max_pmu': str(max(pmu_vals)) if pmu_vals else "0",
'avg_pmu': f"{statistics.mean(pmu_vals):.2f}" if pmu_vals else "0.00",
# Numeric values for accurate sorting
'_sort_count': len(group_ops),
'_sort_max_usec': max(usecs),
'_sort_avg_usec': statistics.mean(usecs),
'_sort_max_cycles': max(cycles),
'_sort_avg_cycles': statistics.mean(cycles),
'_sort_max_pmu': max(pmu_vals) if pmu_vals else 0,
'_sort_avg_pmu': statistics.mean(pmu_vals) if pmu_vals else 0
})
# Sorting logic
actual_sort_key = COL_MAP[sort_col][2]
# We sort numeric fields descending, strings (op/dims) ascending
is_numeric = actual_sort_key.startswith("_") or actual_sort_key == "count"
sorted_groups = sorted(group_stats, key=lambda x: x[actual_sort_key], reverse=is_numeric)[:top_n]
# Define initial column order
active_cols = ["op", "dims", "dtypes"]
if pmu_name:
active_cols += ["max-pmu", "avg-pmu"]
active_cols += ["max-usec", "avg-usec", "max-cycles", "avg-cycles", "count"]
final_headers, final_keys, final_widths = [], [], []
for col_name in active_cols:
data_key, header_text, _ = COL_MAP[col_name]
if "pmu" in col_name and pmu_name:
header_text = header_text.replace("PMU", pmu_name)
natural_width = max([len(row[data_key]) for row in sorted_groups] + [len(header_text)])
target_width = width_overrides.get(col_name, natural_width)
if target_width == 0:
continue
final_headers.append(header_text)
final_keys.append(data_key)
final_widths.append(target_width)
# Print Report
logger.info(f"\n# Profile Report (Top {top_n} Ops sorted by {sort_col})\n")
header_line = "| " + " | ".join(f"{h:<{final_widths[i]}}" for i, h in enumerate(final_headers)) + " |"
sep_line = "| " + " | ".join("-" * final_widths[i] for i in range(len(final_headers))) + " |"
logger.info(header_line)
logger.info(sep_line)
for group in sorted_groups:
row_vals = []
for i, key in enumerate(final_keys):
val = group[key]
if len(val) > final_widths[i]:
val = val[:final_widths[i] - 3] + "..."
row_vals.append(f"{val:<{final_widths[i]}}")
logger.info("| " + " | ".join(row_vals) + " |")
def main():
parser = argparse.ArgumentParser(description="Post-process Op profile info.")
parser.add_argument("logfile")
parser.add_argument("-n", "--top", type=int, default=100)
parser.add_argument("--sort", type=str, default="max-usec", choices=list(COL_MAP.keys()))
parser.add_argument("--pmu-index", type=int)
parser.add_argument("--pmu-name", type=str)
parser.add_argument("--width", action='append', default=['dims:40'], help="Override column width, e.g. --width dims:50")
args = parser.parse_args()
logging.basicConfig(level=logging.INFO, format='%(message)s')
# Sort validation: can't sort by PMU if index isn't provided
if "pmu" in args.sort and args.pmu_index is None:
logger.error(f"Cannot sort by '{args.sort}' without --pmu-index.")
sys.exit(1)
overrides = {}
if args.width:
for w in args.width:
try:
name, val = w.split(':')
overrides[name.lower()] = int(val)
except ValueError:
logger.warning(f"Invalid width format '{w}'")
final_pmu_name = (args.pmu_name or f"#{args.pmu_index}") if args.pmu_index is not None else None
ops = parse_log(args.logfile, pmu_index=args.pmu_index)
generate_report(ops, args.top, overrides, args.sort, pmu_name=final_pmu_name)
if __name__ == "__main__":
main()
-1
View File
@@ -1 +0,0 @@
This directory includes pytest based scripts for running CI jobs on Qualcomm Device Cloud (QDC).
-3
View File
@@ -8,12 +8,9 @@ iniconfig==2.1.0
outcome==1.3.0.post0
packaging==25.0
pluggy==1.6.0
Pygments==2.19.2
PySocks==1.7.1
pytest==8.4.2
pytest-dependency==0.6.0
selenium==4.36.0
setuptools==80.9.0
sniffio==1.3.1
sortedcontainers==2.4.0
tomli==2.3.0
+401
View File
@@ -0,0 +1,401 @@
"""Run llama.cpp Hexagon Android tests in a single QDC Appium job.
Bundles test scripts into one artifact and submits a single QDC job:
1. run_bench_tests_posix.py llama-cli and llama-bench on CPU / GPU / NPU
(from scripts/snapdragon/qdc/)
Results are written to $GITHUB_STEP_SUMMARY when set (GitHub Actions).
Prerequisites:
pip install /path/to/qualcomm_device_cloud_sdk*.whl
Required environment variables:
QDC_API_KEY API key from QDC UI -> Users -> Settings -> API Keys
Usage:
python run_qdc_jobs.py \\
--pkg-dir pkg-snapdragon/llama.cpp \\
--model-url https://.../Llama-3.2-1B-Instruct-Q4_0.gguf \\
--device SM8750
"""
from __future__ import annotations
import argparse
import logging
import os
import re
import shutil
import sys
import tempfile
import time
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field
from pathlib import Path
from qualcomm_device_cloud_sdk.api import qdc_api # ty: ignore[unresolved-import]
from qualcomm_device_cloud_sdk.logging import configure_logging # ty: ignore[unresolved-import]
from qualcomm_device_cloud_sdk.models import ArtifactType, JobMode, JobState, JobSubmissionParameter, JobType, TestFramework # ty: ignore[unresolved-import]
configure_logging(level=logging.INFO, handlers=[logging.StreamHandler()])
log = logging.getLogger(__name__)
POLL_INTERVAL = 30
JOB_TIMEOUT = 3600
LOG_UPLOAD_TIMEOUT = 600
CAPACITY_TIMEOUT = 1800
CAPACITY_POLL = 60
MAX_CONCURRENT_JOBS = 5
TERMINAL_STATES = {JobState.COMPLETED, JobState.CANCELED}
NON_TERMINAL_STATES = {JobState.DISPATCHED, JobState.RUNNING, JobState.SETUP, JobState.SUBMITTED}
_SCRIPTS_DIR = Path(__file__).parent
_TESTS_DIR = _SCRIPTS_DIR / "tests"
_RUN_BENCH = _TESTS_DIR / "run_bench_tests_posix.py"
_RUN_BACKEND_OPS = _TESTS_DIR / "run_backend_ops_posix.py"
_UTILS = _TESTS_DIR / "utils.py"
_CONFTEST = _TESTS_DIR / "conftest.py"
_REQUIREMENTS = _SCRIPTS_DIR / "requirements.txt"
_PYTEST_LINE_RE = re.compile(
r"(?:[\w/]+\.py::)?(?:\w+::)?([\w\[\].-]+)\s+(PASSED|FAILED|ERROR|SKIPPED)"
)
_EXCLUDED_LOGS = {"qdc_android_whole_host-000.log", "qdc_kernel_host-000.log"}
_NON_TERMINAL_STATE_VALUES = {s.value for s in NON_TERMINAL_STATES}
@dataclass
class JobResult:
passed: bool
tests: dict[str, bool] = field(default_factory=dict)
raw_logs: dict[str, str] = field(default_factory=dict)
failure_details: dict[str, str] = field(default_factory=dict)
def build_artifact_zip(
pkg_dir: Path,
stage_dir: Path,
*,
test_mode: str = "bench",
model_url: str | None = None,
) -> Path:
"""Bundle everything into a single QDC artifact zip.
Zip structure (extracted by QDC to /qdc/appium/ on the runner):
llama_cpp_bundle/ installed package (adb pushed to /data/local/tmp/)
tests/
utils.py shared helpers (paths, run_adb_command, )
conftest.py shared pytest fixtures (driver)
test_bench_posix.py bench + cli tests (<<MODEL_URL>> substituted)
AND/OR
test_backend_ops_posix.py test-backend-ops -b HTP0
requirements.txt
"""
shutil.copytree(pkg_dir, stage_dir / "llama_cpp_bundle")
tests_dir = stage_dir / "tests"
tests_dir.mkdir()
shutil.copy(_UTILS, tests_dir / "utils.py")
shutil.copy(_CONFTEST, tests_dir / "conftest.py")
if test_mode in ("bench", "all"):
assert model_url is not None, "--model-url is required for bench/all test modes"
(tests_dir / "test_bench_posix.py").write_text(
_RUN_BENCH.read_text().replace("<<MODEL_URL>>", model_url)
)
if test_mode in ("backend-ops", "all"):
shutil.copy(_RUN_BACKEND_OPS, tests_dir / "test_backend_ops_posix.py")
shutil.copy(_REQUIREMENTS, stage_dir / "requirements.txt")
(stage_dir / "pytest.ini").write_text("[pytest]\naddopts = --junitxml=results.xml\n")
zip_base = str(stage_dir / "artifact")
shutil.make_archive(zip_base, "zip", stage_dir)
return Path(f"{zip_base}.zip")
def wait_for_job(client, job_id: str, timeout: int) -> str:
elapsed = 0
while elapsed < timeout:
raw = qdc_api.get_job_status(client, job_id)
try:
status = JobState(raw)
except ValueError:
status = raw
if status in TERMINAL_STATES:
return raw.lower()
log.info("Job %s: %s", job_id, raw)
time.sleep(POLL_INTERVAL)
elapsed += POLL_INTERVAL
raise TimeoutError(f"Job {job_id} did not finish within {timeout}s")
def wait_for_log_upload(client, job_id: str) -> None:
elapsed = 0
while elapsed <= LOG_UPLOAD_TIMEOUT:
status = (qdc_api.get_job_log_upload_status(client, job_id) or "").lower()
if status in {"completed", "failed"}:
return
log.info("Waiting for log upload (status=%s) ...", status)
time.sleep(POLL_INTERVAL)
elapsed += POLL_INTERVAL
log.warning("Timed out waiting for log upload after %ds", LOG_UPLOAD_TIMEOUT)
def wait_for_capacity(client, max_jobs: int = MAX_CONCURRENT_JOBS) -> None:
"""Block until the user's active (non-terminal) QDC job count is below max_jobs."""
elapsed = 0
while elapsed < CAPACITY_TIMEOUT:
jobs_page = qdc_api.get_jobs_list(client, page_number=0, page_size=50)
if jobs_page is None:
log.warning("Could not retrieve job list; proceeding without capacity check")
return
items = getattr(jobs_page, "data", []) or []
active = sum(1 for j in items if getattr(j, "state", None) in _NON_TERMINAL_STATE_VALUES)
if active < max_jobs:
log.info("Active QDC jobs: %d / %d — proceeding", active, max_jobs)
return
log.info("Active QDC jobs: %d / %d — waiting %ds ...", active, max_jobs, CAPACITY_POLL)
time.sleep(CAPACITY_POLL)
elapsed += CAPACITY_POLL
log.warning("Capacity wait timed out after %ds; proceeding anyway", CAPACITY_TIMEOUT)
def _parse_junit_xml(content: str) -> tuple[dict[str, bool], dict[str, str]]:
try:
root = ET.fromstring(content)
except ET.ParseError:
return {}, {}
results: dict[str, bool] = {}
failures: dict[str, str] = {}
for tc in root.iter("testcase"):
name = tc.get("name", "")
if classname := tc.get("classname", ""):
name = f"{classname}.{name}"
failure_el = tc.find("failure")
if failure_el is None:
failure_el = tc.find("error")
results[name] = failure_el is None
if failure_el is not None:
parts = [failure_el.get("message", ""), failure_el.text or ""]
failures[name] = "\n".join(p for p in parts if p).strip()
return results, failures
def _parse_pytest_output(content: str) -> dict[str, bool]:
results: dict[str, bool] = {}
for m in _PYTEST_LINE_RE.finditer(content):
results[m.group(1)] = m.group(2) == "PASSED"
return results
def fetch_logs_and_parse_tests(
client, job_id: str
) -> tuple[dict[str, bool], dict[str, str], dict[str, str]]:
"""Returns (test_results, raw_logs, failure_details)."""
log_files = qdc_api.get_job_log_files(client, job_id)
if not log_files:
log.warning("No log files returned for job %s", job_id)
return {}, {}, {}
test_results: dict[str, bool] = {}
pytest_fallback: dict[str, bool] = {}
raw_logs: dict[str, str] = {}
failure_details: dict[str, str] = {}
with tempfile.TemporaryDirectory() as tmpdir:
for lf in log_files:
log.info("Downloading log file: %s", lf.filename)
zip_path = os.path.join(tmpdir, "log.zip")
qdc_api.download_job_log_files(client, lf.filename, zip_path)
try:
shutil.unpack_archive(zip_path, tmpdir, "zip")
except Exception as e:
log.warning("Could not unpack %s as zip: %s", lf.filename, e)
for root_dir, _, files in os.walk(tmpdir):
for fname in sorted(files):
fpath = os.path.join(root_dir, fname)
content = Path(fpath).read_text(errors="replace")
if fname.endswith(".xml"):
results, failures = _parse_junit_xml(content)
test_results.update(results)
failure_details.update(failures)
elif fname.endswith(".log"):
if fname in _EXCLUDED_LOGS:
continue
log.info("--- %s ---", fname)
log.info("%s", content)
raw_logs[fname] = content
pytest_fallback.update(_parse_pytest_output(content))
return (test_results if test_results else pytest_fallback), raw_logs, failure_details
def write_summary(result: JobResult, title: str = "QDC Test Results") -> None:
summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
if not summary_path:
return
icon = "" if result.passed else ""
lines = [
f"## {title}\n",
f"Overall: {icon} {'PASSED' if result.passed else 'FAILED'}\n",
]
reportable = {n: ok for n, ok in result.tests.items() if "test_install" not in n}
if reportable:
lines += ["| Test | Result |", "| ---- | ------ |"]
for name, ok in reportable.items():
lines.append(f"| `{name}` | {'' if ok else ''} |")
passed_n = sum(1 for v in reportable.values() if v)
failed_n = sum(1 for v in reportable.values() if not v)
lines += ["", f"**{passed_n} passed, {failed_n} failed**"]
else:
lines.append("_No per-test data available._")
failed_names = [n for n, ok in reportable.items() if not ok]
if failed_names:
lines += ["", "### Failures"]
for name in failed_names:
detail = result.failure_details.get(name)
if detail:
lines += [
f"<details><summary><code>{name}</code></summary>",
"",
"```",
detail,
"```",
"",
"</details>",
]
if result.raw_logs:
lines += ["", "### Raw Logs"]
for fname, content in sorted(result.raw_logs.items()):
lines += [
f"<details><summary>{fname}</summary>",
"",
"```",
content.rstrip(),
"```",
"",
"</details>",
]
with open(summary_path, "a") as f:
f.write("\n".join(lines) + "\n")
def parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
p.add_argument("--pkg-dir", required=True, type=Path,
help="Installed llama.cpp package directory (contains bin/ and lib/)")
p.add_argument("--model-url",
help="Direct URL to the GGUF model file (required for --test bench)")
p.add_argument("--device", required=True,
help="QDC chipset name, e.g. SM8750")
p.add_argument("--test", choices=["bench", "backend-ops", "all"], default="bench",
help="Test suite to run (default: bench)")
p.add_argument("--job-timeout", type=int, default=JOB_TIMEOUT, metavar="SECONDS",
help=f"Max seconds to wait for job completion (default: {JOB_TIMEOUT})")
args = p.parse_args()
if args.test in ("bench", "all") and not args.model_url:
p.error("--model-url is required when --test bench or --test all")
return args
def main() -> int:
args = parse_args()
api_key = os.environ.get("QDC_API_KEY")
if not api_key:
log.error("QDC_API_KEY environment variable must be set")
return 1
if not args.pkg_dir.is_dir():
log.error("--pkg-dir %s does not exist", args.pkg_dir)
return 1
client = qdc_api.get_public_api_client_using_api_key(
api_key_header=api_key,
app_name_header="llama-cpp-ci",
on_behalf_of_header="llama-cpp-ci",
client_type_header="Python",
)
target_id = qdc_api.get_target_id(client, args.device)
if target_id is None:
log.error("Could not find QDC target for device %r", args.device)
return 1
with tempfile.TemporaryDirectory() as tmpdir:
log.info("Building artifact ...")
zip_path = build_artifact_zip(
args.pkg_dir, Path(tmpdir),
test_mode=args.test, model_url=args.model_url,
)
log.info("Uploading artifact (%d MB) ...", zip_path.stat().st_size // 1_000_000)
artifact_id = qdc_api.upload_file(client, str(zip_path), ArtifactType.TESTSCRIPT)
if artifact_id is None:
log.error("Artifact upload failed")
return 1
wait_for_capacity(client)
job_id = qdc_api.submit_job(
public_api_client=client,
target_id=target_id,
job_name="llama.cpp Hexagon tests",
external_job_id=None,
job_type=JobType.AUTOMATED,
job_mode=JobMode.APPLICATION,
timeout=max(1, args.job_timeout // 60),
test_framework=TestFramework.APPIUM,
entry_script=None,
job_artifacts=[artifact_id],
monkey_events=None,
monkey_session_timeout=None,
job_parameters=[JobSubmissionParameter.WIFIENABLED],
)
if job_id is None:
log.error("Job submission failed")
return 1
log.info("Job submitted: %s (device=%s)", job_id, args.device)
try:
job_status = wait_for_job(client, job_id, timeout=args.job_timeout)
except TimeoutError as e:
log.error("%s", e)
write_summary(JobResult(passed=False, tests={}), title=f"QDC Job Timed Out ({args.device})")
return 1
log.info("Job %s finished: %s", job_id, job_status)
wait_for_log_upload(client, job_id)
tests, raw_logs, failure_details = fetch_logs_and_parse_tests(client, job_id)
passed = job_status == JobState.COMPLETED.value.lower()
if tests:
passed = passed and all(tests.values())
if not passed:
log.error("Job did not complete successfully or tests failed (status=%s)", job_status)
result = JobResult(passed=passed, tests=tests, raw_logs=raw_logs, failure_details=failure_details)
if args.test == "backend-ops":
title = f"Backend Ops — HTP0 ({args.device})"
elif args.test == "all":
title = f"QDC Tests ({args.device})"
else:
title = f"QDC Test Results ({args.device})"
write_summary(result, title=title)
return 0 if passed else 1
if __name__ == "__main__":
sys.exit(main())
+20
View File
@@ -0,0 +1,20 @@
"""Shared pytest fixtures for QDC on-device test runners."""
import os
import pytest
from appium import webdriver
from utils import options, write_qdc_log
@pytest.fixture(scope="session", autouse=True)
def driver():
return webdriver.Remote(command_executor="http://127.0.0.1:4723/wd/hub", options=options)
def pytest_sessionfinish(session, exitstatus):
xml_path = getattr(session.config.option, "xmlpath", None) or "results.xml"
if os.path.exists(xml_path):
with open(xml_path) as f:
write_qdc_log("results.xml", f.read())
@@ -0,0 +1,41 @@
"""
On-device test-backend-ops runner for llama.cpp (HTP0 backend).
Executed by QDC's Appium test framework on the QDC runner.
The runner has ADB access to the allocated device.
"""
import os
import sys
import pytest
from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_adb_command, write_qdc_log
@pytest.fixture(scope="session", autouse=True)
def install(driver):
push_bundle_if_needed(f"{BIN_PATH}/test-backend-ops")
@pytest.mark.parametrize("type_a", ["mxfp4", "fp16", "q4_0"])
def test_backend_ops_htp0(type_a):
cmd = f"{CMD_PREFIX} GGML_HEXAGON_HOSTBUF=0 GGML_HEXAGON_EXPERIMENTAL=1 {BIN_PATH}/test-backend-ops -b HTP0 -o MUL_MAT"
if type_a == "q4_0":
cmd += r' -p "^(?=.*type_a=q4_0)(?!.*type_b=f32,m=576,n=512,k=576).*$"'
else:
cmd += f" -p type_a={type_a}"
result = run_adb_command(
cmd,
check=False,
)
write_qdc_log(f"backend_ops_{type_a}.log", result.stdout or "")
assert result.returncode == 0, f"test-backend-ops type_a={type_a} failed (exit {result.returncode})"
if __name__ == "__main__":
ret = pytest.main(["-s", "--junitxml=results.xml", os.path.realpath(__file__)])
if os.path.exists("results.xml"):
with open("results.xml") as f:
write_qdc_log("results.xml", f.read())
sys.exit(ret)
@@ -0,0 +1,76 @@
"""
On-device bench and completion test runner for llama.cpp (CPU, GPU, NPU backends).
Executed by QDC's Appium test framework on the QDC runner.
The runner has ADB access to the allocated device.
Placeholders replaced at artifact creation time by run_qdc_jobs.py:
<<MODEL_URL>> Direct URL to the GGUF model file (downloaded on-device via curl)
"""
import os
import subprocess
import sys
import pytest
from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_adb_command, write_qdc_log
MODEL_PATH = "/data/local/tmp/model.gguf"
PROMPT = "What is the capital of France?"
CLI_OPTS = "--batch-size 128 -n 128 -no-cnv --seed 42"
@pytest.fixture(scope="session", autouse=True)
def install(driver):
push_bundle_if_needed(f"{BIN_PATH}/llama-cli")
# Skip model download if already present
check = subprocess.run(
["adb", "shell", f"ls {MODEL_PATH}"],
text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
)
if check.returncode != 0:
run_adb_command(f'curl -L -J --output {MODEL_PATH} "<<MODEL_URL>>"')
@pytest.mark.parametrize("device,extra_flags", [
pytest.param("none", "-ctk q8_0 -ctv q8_0", id="cpu"),
pytest.param("GPUOpenCL", "", id="gpu"),
pytest.param("HTP0", "-ctk q8_0 -ctv q8_0", id="npu"),
])
def test_llama_completion(device, extra_flags):
result = run_adb_command(
f'{CMD_PREFIX} {BIN_PATH}/llama-completion'
f' -m {MODEL_PATH} --device {device} -ngl 99 -t 4 {CLI_OPTS} {extra_flags} -fa on'
f' -p "{PROMPT}"',
check=False,
)
write_qdc_log(f"llama_completion_{device}.log", result.stdout or "")
assert result.returncode == 0, f"llama-completion {device} failed (exit {result.returncode})"
_DEVICE_LOG_NAME = {"none": "cpu", "GPUOpenCL": "gpu", "HTP0": "htp"}
@pytest.mark.parametrize("device", [
pytest.param("none", id="cpu"),
pytest.param("GPUOpenCL", id="gpu"),
pytest.param("HTP0", id="npu"),
])
def test_llama_bench(device):
result = run_adb_command(
f"{CMD_PREFIX} {BIN_PATH}/llama-bench"
f" -m {MODEL_PATH} --device {device} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32",
check=False,
)
write_qdc_log(f"llama_bench_{_DEVICE_LOG_NAME[device]}.log", result.stdout or "")
assert result.returncode == 0, f"llama-bench {device} failed (exit {result.returncode})"
if __name__ == "__main__":
ret = pytest.main(["-s", "--junitxml=results.xml", os.path.realpath(__file__)])
if os.path.exists("results.xml"):
with open("results.xml") as f:
write_qdc_log("results.xml", f.read())
sys.exit(ret)
@@ -1,63 +0,0 @@
import pytest
import subprocess
import sys
tmp_path='/data/local/tmp'
pkg_path=f'{tmp_path}/llama.cpp'
lib_path=f'{pkg_path}/lib'
bin_path=f'{pkg_path}/bin'
model='../gguf/Llama-3.2-1B-Instruct-Q4_0.gguf'
cli_pref=f'cd {pkg_path} && LD_LIBRARY_PATH={lib_path} ADSP_LIBRARY_PATH={lib_path} {bin_path}'
def run_cmd(cmd):
p = subprocess.run(cmd, text = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
sys.stdout.write(p.stdout)
assert(p.returncode == 0)
@pytest.mark.dependency()
def test_install():
run_cmd(['adb', 'push', 'llama.cpp', f'{tmp_path}'])
run_cmd(['adb', 'shell', f'chmod 755 {bin_path}/*'])
## Basic cli tests
def run_llama_cli(dev, opts):
prompt='what is the most popular cookie in the world?\nPlease provide a very brief bullet point summary.\nBegin your answer with **BEGIN**.'
opts = '--batch-size 128 -n 128 -no-cnv --seed 42 ' + opts
run_cmd(['adb', 'shell', f'{cli_pref}/llama-cli -m {model} --device {dev} -ngl 99 -t 4 {opts} -p "{prompt}"'])
@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_cpu():
run_llama_cli('none', '-ctk q8_0 -ctv q8_0 -fa on')
@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_gpu():
run_llama_cli('GPUOpenCL', '-fa on')
@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_npu():
run_llama_cli('HTP0', '-ctk q8_0 -ctv q8_0 -fa on')
## Basic bench tests
def run_llama_bench(dev):
run_cmd(['adb', 'shell', f'{cli_pref}/llama-bench -m {model} --device {dev} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32'])
@pytest.mark.dependency(depends=['test_install'])
def test_llama_bench_cpu():
run_llama_bench('none')
def test_llama_bench_gpu():
run_llama_bench('GPUOpenCL')
def test_llama_bench_npu():
run_llama_bench('HTP0')
+93
View File
@@ -0,0 +1,93 @@
"""Shared helpers for QDC on-device test runners."""
import logging
import os
import subprocess
import tempfile
from appium.options.common import AppiumOptions
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# On-device paths
# ---------------------------------------------------------------------------
BUNDLE_PATH = "/data/local/tmp/llama_cpp_bundle"
QDC_LOGS_PATH = "/data/local/tmp/QDC_logs"
LIB_PATH = f"{BUNDLE_PATH}/lib"
BIN_PATH = f"{BUNDLE_PATH}/bin"
ENV_PREFIX = (
f"export LD_LIBRARY_PATH={LIB_PATH} && "
f"export ADSP_LIBRARY_PATH={LIB_PATH} && "
f"chmod +x {BIN_PATH}/* &&"
)
CMD_PREFIX = f"cd {BUNDLE_PATH} && {ENV_PREFIX}"
# ---------------------------------------------------------------------------
# Appium session options
# ---------------------------------------------------------------------------
options = AppiumOptions()
options.set_capability("automationName", "UiAutomator2")
options.set_capability("platformName", "Android")
options.set_capability("deviceName", os.getenv("ANDROID_DEVICE_VERSION"))
# ---------------------------------------------------------------------------
# ADB helpers
# ---------------------------------------------------------------------------
def run_adb_command(cmd: str, *, check: bool = True) -> subprocess.CompletedProcess:
# Append exit-code sentinel because `adb shell` doesn't reliably propagate
# the on-device exit code (older ADB versions always return 0).
raw = subprocess.run(
["adb", "shell", f"{cmd}; echo __RC__:$?"],
text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
)
stdout = raw.stdout
returncode = raw.returncode
if stdout:
lines = stdout.rstrip("\n").split("\n")
if lines and lines[-1].startswith("__RC__:"):
try:
returncode = int(lines[-1][7:])
stdout = "\n".join(lines[:-1]) + "\n"
except ValueError:
pass
log.info("%s", stdout)
result = subprocess.CompletedProcess(raw.args, returncode, stdout=stdout)
if check:
assert returncode == 0, f"Command failed (exit {returncode})"
return result
def write_qdc_log(filename: str, content: str) -> None:
"""Push content as a log file to QDC_LOGS_PATH on the device for QDC log collection."""
subprocess.run(
["adb", "shell", f"mkdir -p {QDC_LOGS_PATH}"],
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
)
with tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False) as f:
f.write(content)
tmp_path = f.name
try:
subprocess.run(
["adb", "push", tmp_path, f"{QDC_LOGS_PATH}/{filename}"],
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
)
finally:
os.unlink(tmp_path)
def push_bundle_if_needed(check_binary: str) -> None:
"""Push llama_cpp_bundle to the device if check_binary is not already present."""
result = subprocess.run(
["adb", "shell", f"ls {check_binary}"],
text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
)
if result.returncode != 0:
subprocess.run(
["adb", "push", "/qdc/appium/llama_cpp_bundle/", "/data/local/tmp"],
text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
)
+3 -3
View File
@@ -21,11 +21,11 @@ if ($null -ne $env:V) {
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
$env:GGML_HEXAGON_PROFILE=$env:PROF
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
if ($null -ne $env:OPSTAGE) {
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
}
if ($null -ne $env:NHVX) {

Some files were not shown because too many files have changed in this diff Show More