Compare commits

...

5 Commits

Author SHA1 Message Date
Daniel Bevenius 5e6229a840 common : fix double bos, use common_chat_templates for add_bos and add_eos (#15326)
This commit updates common_chat_templates_apply_jinja to use the
the add_bos and add_eos parameters from the chat template instead of
the inputs.

The motivation for this is that currently if the `add_bos` and `add_eos`
from the input parameters are used it is possible to there will be a
missmatch between the model and the chat template which can lead to the
the removal of duplicate BOS/EOS tokens in chat.cpp `apply` to not
happen leading to two BOS tokens being added to the template.
2025-08-15 19:50:52 +02:00
lhez e2c1bfff53 opencl: add initial mxfp4 support via mv (#15270)
* opencl: add reference `mul_mv_mxfp4_f32`

* opencl: add reference `mul_mv_id` for mxfp4

* Q4_0 tranpose fix for Adreno

---------

Co-authored-by: shawngu-quic <shawngu@qti.qualcomm.com>
2025-08-15 09:52:14 -07:00
Georgi Gerganov 5edf1592fd vulkan : fix out-of-bounds access in argmax kernel (#15342)
ggml-ci
2025-08-15 16:16:36 +02:00
Georgi Gerganov db3010bd23 vulkan : fix compile warnings on macos (#15340)
ggml-ci
2025-08-15 15:28:28 +02:00
Aaron Teo ff27f80a74 ggml: initial IBM zDNN backend (#14975)
* ggml-zdnn: inital backend impl

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

ggml-zdnn: temp change z17 to arch15

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

ggml-zdnn: fix build bugs

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: tensor->extra logging check

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

ggml-zdnn: add layout name mapping, ztensor information

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

ggml-zdnn: separate logging into its own line

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

ggml-zdnn: add shape comparison

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

ggml-zdnn: add ggml_tensor shape log

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

ggml-zdnn: fix incorrect shape logging

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add output buffer check

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: run compute and store into tensor->extra

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add set_tensor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add more loggers

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: update set_tensor logging to check only for matmul

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: last working matmul version

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add comments to prevent accidentally deleting lines

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: support op out_prod

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: update op out_prod to use tensor->extra

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: rewrite the backend implementation

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: bugfix new impl

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix compiler warnings and bugfixes

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: test ztensor finding in init_tensor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: implement at least 1 op to test

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: assign tensor->extra to buffer

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add check for view tensors to prevent init_tensor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: rework init_tensor to create new buffers

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: switch to std vector instead of array

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: switch buffers back and set to arbitrary number

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: impl init_tensor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: update supports_op matmul matrix

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix incorrect ztensor shape, reduce memory padding

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: code clean up

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: impl matmul

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix compiler error missing type

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix missing data transform call

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add bias init_tensor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: tighten memory usage, change string allocation

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add bias ztensor and data free

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add bias data transform

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add more debug info for extra buffer transform

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add logger to check if mat mul ops go through set_tensor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: activate bias transform in matmul

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: move weights transform into mulmat

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add more safeguards in matmul

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix sequencing of transforms

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: bugfix transform ztensor vs origtensor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: figure out why sigtrap is happening

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix sigsegv

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: move everything back to local declaration

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: move bias data to local also

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: bring back working matmul

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: rewrite into mre

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix missing vector import

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix missing vector import in header

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: attempt to fix sigsegv

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix missing load tensor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix invalid ztensor buffer release

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add logging to debug free buffer

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: remove free_buffer debug info

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add parmblkformat detections

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add nnpa installed detection

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add zdnn_init call for static libs

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add init_tensor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: attempt at fixing invalid buffer

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: switch to using deque to fix pointer deref problem

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add weights logging to check

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: attempt to use unique ptr

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add tensor to pre_tfm_desc logging

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add inputs logging

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: disable op_none initialisation for testing

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix missing return from init_tensor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: load ztensors in cgraph exec

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: work on moving output ztensor as well

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: disable logging and breakpoints for full test

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: attempt at manually changing the layout

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: attempt at using default nwhc format instead

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: disable global load ztensor for now

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix errorenous output load tensor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: add guards to prevent loading ztensor if transformed

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: code cleanup

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: bring load ztensor back to init routine

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: code clean up

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix ztensor deallocation abort

stabilise ggml <-> zdnn api

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: clean up matmul selection

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: clean up project structure

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: update documentation, prepare for upstream

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* chore: add codeowners

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: disable batched matmul

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: attempt at fixing tensor views during matmul

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: deny all view tensors directly

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix pr comments

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* docs: update ops docs for zdnn

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: redo test-backend-ops for ops.md

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: fix typo in build-s390x.md

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* codeowners: remove taronaeo for now

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* Revert "codeowners: remove taronaeo for now"

This reverts commit 411ea4ed78.

* ggml-zdnn: remove unused ggml_zdnn macro

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

---------

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2025-08-15 21:11:22 +08:00
24 changed files with 9783 additions and 123 deletions
@@ -40,7 +40,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
multiple: true
validations:
required: true
+1 -1
View File
@@ -42,7 +42,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
multiple: true
validations:
required: true
+5
View File
@@ -22,6 +22,11 @@ Vulkan:
- any-glob-to-any-file:
- ggml/include/ggml-vulkan.h
- ggml/src/ggml-vulkan/**
IBM zDNN:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-zdnn.h
- ggml/src/ggml-zdnn/**
documentation:
- changed-files:
- any-glob-to-any-file:
+1
View File
@@ -10,3 +10,4 @@
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler
/ggml/src/ggml-vulkan/ @0cc4m
/ggml/src/ggml-zdnn/ @taronaeo
+2 -2
View File
@@ -2061,8 +2061,8 @@ static common_chat_params common_chat_templates_apply_jinja(
params.enable_thinking = inputs.enable_thinking;
params.grammar = inputs.grammar;
params.now = inputs.now;
params.add_bos = inputs.add_bos;
params.add_eos = inputs.add_eos;
params.add_bos = tmpls->add_bos;
params.add_eos = tmpls->add_eos;
params.extra_context = json::object();
for (auto el : inputs.chat_template_kwargs) {
+29 -11
View File
@@ -76,6 +76,23 @@ cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
```
## IBM zDNN Accelerator
This provides acceleration using the IBM zAIU co-processor located in the Telum I and Telum II processors. Make sure to have the [IBM zDNN library](https://github.com/IBM/zDNN) installed.
#### Compile from source from IBM
You may find the official build instructions here: [Building and Installing zDNN](https://github.com/IBM/zDNN?tab=readme-ov-file#building-and-installing-zdnn)
### Compilation
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_ZDNN=ON
cmake --build build --config Release -j$(nproc)
```
## Getting GGUF Models
All models need to be converted to Big-Endian. You can achieve this in three cases:
@@ -145,15 +162,15 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
### 1. SIMD Acceleration
Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.
Only available in IBM z15/LinuxONE 3 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.
### 2. NNPA Vector Intrinsics Acceleration
Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
### 3. zDNN Accelerator
### 3. zDNN Accelerator (WIP)
_Only available in IBM z16 / LinuxONE 4 or later system. No support currently available._
Only available in IBM z17/LinuxONE 5 or later system with the `-DGGML_ZDNN=ON` compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs will default back to CPU routines.
### 4. Spyre Accelerator
@@ -229,11 +246,12 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
## Appendix A: Hardware Support Matrix
| | Support | Minimum Compiler Version |
| ------- | ------- | ------------------------ |
| IBM z15 | ✅ | |
| IBM z16 | ✅ | |
| IBM z17 | ✅ | GCC 15.1.0 |
| | Support | Minimum Compiler Version |
| -------- | ------- | ------------------------ |
| IBM z15 | ✅ | |
| IBM z16 | ✅ | |
| IBM z17 | ✅ | GCC 15.1.0 |
| IBM zDNN | ✅ | |
- ✅ - supported and verified to run as intended
- 🚫 - unsupported, we are unlikely able to provide support
@@ -242,7 +260,7 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
| | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
| ---------- | ----------- | ---- | ---- | ----- |
| FP32 | ✅ | ✅ | | ❓ |
| FP32 | ✅ | ✅ | | ❓ |
| FP16 | ✅ | ✅ | ❓ | ❓ |
| BF16 | 🚫 | 🚫 | ❓ | ❓ |
| Q4_0 | ✅ | ✅ | ❓ | ❓ |
@@ -273,4 +291,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 25, 2025.
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 31, 2025.
+89 -88
View File
@@ -12,91 +12,92 @@ Legend:
- 🟡 Partially supported by this backend
- ❌ Not supported by this backend
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan |
|-----------|------|------|------|------|------|------|------|------|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 |
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 |
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ |
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
| SOFT_MAX | ❌ | 🟡 | | | | | 🟡 | |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | | | | ✅ |
| SQR | ❌ | | | | 🟡 | ❌ | ✅ | 🟡 |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| SUB | ❌ | ✅ | ✅ | | 🟡 | 🟡 | | |
| SUM | ❌ | ✅ | ✅ | ✅ | | | ✅ | ✅ |
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | | ✅ | ✅ | |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
| TANH | | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | | ✅ | | | |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ |
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
|-----------|------|------|------|------|------|------|------|------|------|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| SOFTCAP | ❌ | | | | | | | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | | | 🟡 | ✅ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | | | ❌ | ✅ | |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | | |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | |
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| SWIGLU | ❌ | | ✅ | ✅ | 🟡 | | ✅ | 🟡 | |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | |
| TIMESTEP_EMBEDDING | ❌ | | ✅ | ✅ | | ✅ | | ✅ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
+8134
View File
File diff suppressed because it is too large Load Diff
+1
View File
@@ -188,6 +188,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_WEBGPU "ggml: use WebGPU" OFF)
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
option(GGML_ZDNN "ggml: use zDNN" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
+16
View File
@@ -0,0 +1,16 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
#ifdef __cplusplus
}
#endif
+1
View File
@@ -382,6 +382,7 @@ ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
ggml_add_backend(WebGPU)
ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)
foreach (target ggml-base ggml)
+7
View File
@@ -49,6 +49,10 @@
#include "ggml-webgpu.h"
#endif
#ifdef GGML_USE_ZDNN
#include "ggml-zdnn.h"
#endif
#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif
@@ -180,6 +184,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_WEBGPU
register_backend(ggml_backend_webgpu_reg());
#endif
#ifdef GGML_USE_ZDNN
register_backend(ggml_backend_zdnn_reg());
#endif
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
+1 -1
View File
@@ -460,7 +460,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
# binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
message(STATUS "z17 target")
list(APPEND ARCH_FLAGS -march=z17)
list(APPEND ARCH_FLAGS -march=arch15)
else()
message(STATUS "Unknown target")
message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
+2
View File
@@ -82,7 +82,9 @@ set(GGML_OPENCL_KERNELS
mul_mv_q4_0_f32_1d_8x_flat
mul_mv_q4_0_f32_1d_16x_flat
mul_mv_q6_k
mul_mv_mxfp4_f32
mul_mv_id_q4_0_f32_8x_flat
mul_mv_id_mxfp4_f32
mul_mm_f32_f32_l4_lm
mul_mm_f16_f32_l4_lm
mul
+141 -6
View File
@@ -365,6 +365,7 @@ struct ggml_backend_opencl_context {
cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
cl_program program_mul_mv_q6_K;
cl_program program_mul_mv_mxfp4_f32;
cl_program program_mul_mv_f16_f16;
cl_program program_mul_mv_f16_f32_1row;
cl_program program_mul_mv_f16_f32_l4;
@@ -398,6 +399,7 @@ struct ggml_backend_opencl_context {
cl_program program_conv_2d_f16_f32;
cl_program program_tsembd;
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
cl_program program_mul_mv_id_mxfp4_f32;
cl_program program_mul_mm_f32_f32_l4_lm;
cl_program program_mul_mm_f16_f32_l4_lm;
@@ -439,6 +441,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_convert_block_q4_0_noshuffle;
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
cl_kernel kernel_mul_mv_q6_K_f32;
cl_kernel kernel_mul_mv_mxfp4_f32;
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
cl_kernel kernel_argsort_f32_i32;
cl_kernel kernel_sum_rows_f32;
@@ -455,6 +458,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_conv_2d_f16_f32;
cl_kernel kernel_timestep_embedding;
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
cl_kernel kernel_mul_mv_id_mxfp4_f32;
cl_kernel kernel_mul_mm_f32_f32_l4_lm;
cl_kernel kernel_mul_mm_f16_f32_l4_lm;
@@ -577,6 +581,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_transpose_32;
cl_kernel kernel_transpose_32_16;
cl_kernel kernel_transpose_16;
cl_kernel kernel_transpose_16_4x1;
cl_mem A_s_d_max; // max scale buffer size for transpose
cl_mem A_q_d_max; // max weight buffer size for transpose
@@ -971,6 +976,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// mul_mv_mxfp4_f32
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mul_mv_mxfp4_f32.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mv_mxfp4_f32.cl");
#endif
backend_ctx->program_mul_mv_mxfp4_f32 =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mv_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_mxfp4_f32, "kernel_mul_mv_mxfp4_f32", &err), err));
GGML_LOG_CONT(".");
}
// mul_mv_f16_f16
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1611,6 +1632,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// mul_mv_id_mxfp4_f32
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mul_mv_id_mxfp4_f32.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mv_id_mxfp4_f32.cl");
#endif
backend_ctx->program_mul_mv_id_mxfp4_f32 =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mv_id_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_id_mxfp4_f32, "kernel_mul_mv_id_mxfp4_f32", &err), err));
GGML_LOG_CONT(".");
}
// Adreno kernels
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
// transpose
@@ -1628,6 +1665,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
GGML_LOG_CONT(".");
}
@@ -2552,13 +2590,14 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
return true;
} else if (op->src[0]->type == GGML_TYPE_F32) {
return op->src[1]->type == GGML_TYPE_F32;
} else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
} else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_MXFP4 ||
op->src[0]->type == GGML_TYPE_Q6_K) {
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
}
return false;
case GGML_OP_MUL_MAT_ID:
if (op->src[0]->type == GGML_TYPE_Q4_0) {
if (op->src[0]->type == GGML_TYPE_Q4_0 ||
op->src[0]->type == GGML_TYPE_MXFP4) {
if (op->src[1]->type == GGML_TYPE_F32) {
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
}
@@ -2944,7 +2983,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
// cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
CL_CHECK(err);
// size_t d_size_bytes = M * (K / 32) / 2 * sizeof(float);
bool K_tile_trans = true;
if ((K / 32) % 4 != 0){
K_tile_trans =false;
}
size_t d_size_bytes = M * (K / 32) * 2;
region.origin = 0;
region.size = d_size_bytes;
@@ -2985,10 +3027,15 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
CL_CHECK(err);
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
if (K_tile_trans) {
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
img_desc_1d.image_width = M * K / 32 / 4;
} else {
img_fmt_1d = { CL_R, CL_HALF_FLOAT };
img_desc_1d.image_width = M * K / 32;
}
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc_1d.image_width = M * K / 32 / 4;
img_desc_1d.buffer = extra->d;
d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
CL_CHECK(err);
@@ -3024,6 +3071,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
int width_s = K / 32 / 4;
kernel = backend_ctx->kernel_transpose_16;
if (!K_tile_trans) {
kernel = backend_ctx->kernel_transpose_16_4x1;
width_s = K / 32;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
@@ -6254,11 +6305,47 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
break;
case GGML_TYPE_MXFP4: {
kernel = backend_ctx->kernel_mul_mv_mxfp4_f32;
if (backend_ctx->gpu_family == INTEL) {
nth0 = 16;
nth1 = 2;
ndst = nth1*2;
} else if (backend_ctx->gpu_family == ADRENO) {
nth0 = 64;
nth1 = 2;
ndst = nth1*2;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r3));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float)*nth0,nullptr));
break;
}
default:
GGML_ASSERT(false && "not implemented");
}
if (src0t == GGML_TYPE_Q4_0 ||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_MXFP4 ||
src0t == GGML_TYPE_Q4_1 ||
src0t == GGML_TYPE_Q8_0 ||
src0t == GGML_TYPE_Q2_K) {
@@ -6307,10 +6394,12 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offset2 = extra2->offset + src2->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
@@ -6325,7 +6414,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
const int ne03 = src0->ne[3];
const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
@@ -6334,6 +6425,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
const cl_ulong nb11 = src1->nb[1];
const cl_ulong nb12 = src1->nb[2];
const cl_ulong nb13 = src1->nb[3];
const int ne20 = src2->ne[0];
const int ne21 = src2->ne[1];
@@ -6401,6 +6493,49 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
break;
}
case GGML_TYPE_MXFP4: {
kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32;
if (backend_ctx->gpu_family == INTEL) {
sgs = 16;
nsg = 2;
ndst = 2;
} else if (backend_ctx->gpu_family == ADRENO) {
sgs = 64;
nsg = 2;
ndst = 2;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne20));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne21));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb21));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs,nullptr));
break;
}
default:
GGML_ASSERT(false && "not implemented");;
}
@@ -0,0 +1,189 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK_MXFP4 32
typedef struct {
uchar e; // E8M0
uchar qs[QK_MXFP4/2];
} block_mxfp4;
constant static float kvalues_mxfp4_f[16] = {
0, .5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f, -0, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f
};
static inline float e8m0_to_fp32(uchar x) {
int bits;
if (x == 0) {
bits = 0x00400000;
} else {
bits = (uint) x << 23;
}
return as_float(bits);
}
#ifdef INTEL_GPU
#define N_R0_MXFP4 2 // number of rows each subgroup works on
#define N_SG_MXFP4 2 // number of subgroups in a work group
#define N_SIMDWIDTH 16 // subgroup size
#elif defined (ADRENO_GPU)
#define N_R0_MXFP4 2
#define N_SG_MXFP4 2
#define N_SIMDWIDTH 64
#endif
inline void mul_mv_mxfp4_f32(
global char * src0,
global char * src1,
global char * dst,
int ne00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne12,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3,
local char * shmem
) {
local float * shmem_f32 = (local float *) shmem;
int nb = ne00/QK_MXFP4;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = 0;
int first_row = (r0 * N_SG_MXFP4 + get_sub_group_id()) * N_R0_MXFP4;
uint i12 = im%ne12;
uint i13 = im/ne12;
ulong offset_src0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global block_mxfp4 * x = (global block_mxfp4 *) (src0 + offset_src0);
global float * y = (global float *) (src1 + offset_src1);
const short ix = get_sub_group_local_id()/2; // 0...15
const short it = get_sub_group_local_id()%2; // 0 or 1
shmem_f32[get_sub_group_local_id()] = kvalues_mxfp4_f[get_sub_group_local_id()%16];
barrier(CLK_LOCAL_MEM_FENCE);
float4 yl[4];
float sumf[N_R0_MXFP4] = {0.f};
global float * yb = y + ix * QK_MXFP4 + it * 8;
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
global float4 * y4 = (global float4 *)yb;
yl[0] = y4[0];
yl[1] = y4[4];
yl[2] = y4[1];
yl[3] = y4[5];
for (short row = 0; row < N_R0_MXFP4; row++) {
global block_mxfp4 * xb = x + row*nb + ib;
global uchar * q2 = (global uchar *)(xb->qs + 8*it);
float4 acc1 = yl[0]*(float4)(shmem_f32[q2[0] & 0x0F], shmem_f32[q2[1] & 0x0F], shmem_f32[q2[2] & 0x0F], shmem_f32[q2[3] & 0x0F]);
float4 acc2 = yl[1]*(float4)(shmem_f32[q2[0] >> 4 ], shmem_f32[q2[1] >> 4 ], shmem_f32[q2[2] >> 4 ], shmem_f32[q2[3] >> 4 ]);
float4 acc3 = yl[2]*(float4)(shmem_f32[q2[4] & 0x0F], shmem_f32[q2[5] & 0x0F], shmem_f32[q2[6] & 0x0F], shmem_f32[q2[7] & 0x0F]);
float4 acc4 = yl[3]*(float4)(shmem_f32[q2[4] >> 4 ], shmem_f32[q2[5] >> 4 ], shmem_f32[q2[6] >> 4 ], shmem_f32[q2[7] >> 4 ]);
acc1 = (acc1 + acc3) + (acc2 + acc4);
sumf[row] += e8m0_to_fp32(xb->e) * ((acc1.s0 + acc1.s1) + (acc1.s2 + acc1.s3));
}
yb += (N_SIMDWIDTH/2) * QK_MXFP4;
}
global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;
for (int row = 0; row < N_R0_MXFP4 && first_row + row < ne0; ++row) {
float sum_all = sub_group_reduce_add(sumf[row]);
if (get_sub_group_local_id() == 0) {
dst_f32[first_row + row] = sum_all;
}
}
}
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_id_mxfp4_f32(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * src2,
ulong offset2,
global char * dst,
ulong offsetd,
int ne00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne11,
int ne12,
ulong nb11,
ulong nb12,
ulong nb13,
int ne20,
int ne21,
ulong nb21,
int ne0,
int ne1,
int r2,
int r3,
local char * shmem
) {
src0 = (global char *)((global char *)src0 + offset0);
src1 = (global char *)((global char *)src1 + offset1);
src2 = (global char *)((global char *)src2 + offset2);
dst = (global char *)((global char *)dst + offsetd);
const int iid1 = get_group_id(2)/ne20;
const int idx = get_group_id(2)%ne20;
int i02 = ((global int *) (src2 + iid1*nb21))[idx];
int i11 = idx % ne11;
int i12 = iid1;
int i1 = idx;
int i2 = i12;
global char * src0_cur = src0 + i02*nb02;
global char * src1_cur = src1 + i11*nb11 + i12*nb12;
global char * dst_cur = dst + (i1*ne0 + i2*ne1*ne0)*sizeof(float);
mul_mv_mxfp4_f32(src0_cur, src1_cur, dst_cur,
ne00, nb01, nb02, nb03, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shmem);
}
@@ -0,0 +1,144 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK_MXFP4 32
typedef struct {
uchar e; // E8M0
uchar qs[QK_MXFP4/2];
} block_mxfp4;
constant static float kvalues_mxfp4_f[16] = {
0, .5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f, -0, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f
};
static inline float e8m0_to_fp32(uchar x) {
int bits;
if (x == 0) {
bits = 0x00400000;
} else {
bits = (uint) x << 23;
}
return as_float(bits);
}
#ifdef INTEL_GPU
#define N_R0_MXFP4 2 // number of rows each subgroup works on
#define N_SG_MXFP4 2 // number of subgroups in a work group
#define N_SIMDWIDTH 16 // subgroup size
#elif defined (ADRENO_GPU)
#define N_R0_MXFP4 2
#define N_SG_MXFP4 2
#define N_SIMDWIDTH 64
#endif
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_mxfp4_f32(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
int ne00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne12,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3,
local char * shmem
) {
src0 = (global char*)((global char*)src0 + offset0);
src1 = (global char*)((global char*)src1 + offset1);
dst = (global char*)((global char*)dst + offsetd);
local float * shmem_f32 = (local float *) shmem;
int nb = ne00/QK_MXFP4;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
int first_row = (r0 * N_SG_MXFP4 + get_sub_group_id()) * N_R0_MXFP4;
uint i12 = im%ne12;
uint i13 = im/ne12;
ulong offset_src0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global block_mxfp4 * x = (global block_mxfp4 *) (src0 + offset_src0);
global float * y = (global float *) (src1 + offset_src1);
const short ix = get_sub_group_local_id()/2; // 0...15
const short it = get_sub_group_local_id()%2; // 0 or 1
shmem_f32[get_sub_group_local_id()] = kvalues_mxfp4_f[get_sub_group_local_id()%16];
barrier(CLK_LOCAL_MEM_FENCE);
float4 yl[4];
float sumf[N_R0_MXFP4] = {0.f};
global float * yb = y + ix * QK_MXFP4 + it * 8;
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
global float4 * y4 = (global float4 *)yb;
yl[0] = y4[0];
yl[1] = y4[4];
yl[2] = y4[1];
yl[3] = y4[5];
for (short row = 0; row < N_R0_MXFP4; row++) {
global block_mxfp4 * xb = x + row*nb + ib;
global uchar * q2 = (global uchar *)(xb->qs + 8*it);
float4 acc1 = yl[0]*(float4)(shmem_f32[q2[0] & 0x0F], shmem_f32[q2[1] & 0x0F], shmem_f32[q2[2] & 0x0F], shmem_f32[q2[3] & 0x0F]);
float4 acc2 = yl[1]*(float4)(shmem_f32[q2[0] >> 4 ], shmem_f32[q2[1] >> 4 ], shmem_f32[q2[2] >> 4 ], shmem_f32[q2[3] >> 4 ]);
float4 acc3 = yl[2]*(float4)(shmem_f32[q2[4] & 0x0F], shmem_f32[q2[5] & 0x0F], shmem_f32[q2[6] & 0x0F], shmem_f32[q2[7] & 0x0F]);
float4 acc4 = yl[3]*(float4)(shmem_f32[q2[4] >> 4 ], shmem_f32[q2[5] >> 4 ], shmem_f32[q2[6] >> 4 ], shmem_f32[q2[7] >> 4 ]);
acc1 = (acc1 + acc3) + (acc2 + acc4);
sumf[row] += e8m0_to_fp32(xb->e) * ((acc1.s0 + acc1.s1) + (acc1.s2 + acc1.s3));
}
yb += (N_SIMDWIDTH/2) * QK_MXFP4;
}
global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;
for (int row = 0; row < N_R0_MXFP4 && first_row + row < ne0; ++row) {
float sum_all = sub_group_reduce_add(sumf[row]);
if (get_sub_group_local_id() == 0) {
dst_f32[first_row + row] = sum_all;
}
}
}
+20
View File
@@ -24,6 +24,26 @@ kernel void kernel_transpose_16(
write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}
// Padded kernel for irregular shape
kernel void kernel_transpose_16_4x1(
__read_only image1d_buffer_t input,
__write_only image1d_buffer_t output,
const uint rows,
const uint cols
) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int j_2 = j << 2;
half temp0 = read_imageh(input, (j_2 + 0) * cols + i).x;
half temp1 = read_imageh(input, (j_2 + 1) * cols + i).x;
half temp2 = read_imageh(input, (j_2 + 2) * cols + i).x;
half temp3 = read_imageh(input, (j_2 + 3) * cols + i).x;
write_imageh(output, i * rows + j, (half4)(temp0, temp1, temp2, temp3));
}
// 32-bit transpose, loading/storing a 4x4 tile of elements
kernel void kernel_transpose_32(
__read_only image1d_buffer_t input,
+6 -9
View File
@@ -8360,7 +8360,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
src2 != nullptr, (uint32_t)src0->ne[2], s1, s2,
sections[0], sections[1], sections[2], sections[3], backprop
{ sections[0], sections[1], sections[2], sections[3] }, backprop
}, dryrun);
}
@@ -8392,7 +8392,7 @@ static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
}
static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f }, dryrun);
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }, dryrun);
}
static void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -9627,7 +9627,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
default:
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
GGML_ABORT("fatal error");
return false;
}
vk_context compute_ctx;
@@ -10912,7 +10911,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
default:
return false;
}
break;
case GGML_OP_GLU:
switch (ggml_get_glu_op(op)) {
case GGML_GLU_OP_GEGLU:
@@ -10928,7 +10926,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
default:
return false;
}
break;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
{
@@ -10992,7 +10989,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
}
return true;
} break;
}
case GGML_OP_FLASH_ATTN_EXT:
{
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
@@ -11082,7 +11079,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
default:
return false;
}
} break;
}
case GGML_OP_SET_ROWS:
{
switch (op->type) {
@@ -11099,7 +11096,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
default:
return false;
}
} break;
}
case GGML_OP_CONT:
case GGML_OP_CPY:
case GGML_OP_DUP:
@@ -11151,7 +11148,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
return true;
}
return false;
} break;
}
case GGML_OP_REPEAT:
return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
case GGML_OP_REPEAT_BACK:
@@ -5,6 +5,8 @@
#extension GL_EXT_control_flow_attributes : enable
#define FLT_MAX 3.402823466e+38F
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@@ -19,19 +21,26 @@ void main() {
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint col = gl_LocalInvocationID.x;
if (col >= p.KX) {
if (row >= p.KY) {
return;
}
A_TYPE amax = data_a[row*p.KX + col];
tmp[col] = col;
A_TYPE amax = -FLT_MAX;
uint acol = col;
if (col < p.KX) {
amax = data_a[row*p.KX + col];
}
for (uint i = col + BLOCK_SIZE; i < p.KX; i += BLOCK_SIZE) {
A_TYPE val = data_a[row*p.KX + i];
if (val > amax) {
amax = val;
tmp[col] = i;
acol = i;
}
}
tmp[col] = acol;
tmpmax[col] = amax;
barrier();
+36
View File
@@ -0,0 +1,36 @@
if (DEFINED ZDNN_ROOT)
message(STATUS "zdnn: using ZDNN_ROOT override: ${ZDNN_ROOT}")
set(ZDNN_HINT "${ZDNN_ROOT}")
else()
set(ZDNN_HINT "")
endif()
find_path(ZDNN_INCLUDE
NAMES zdnn.h
HINTS ${ZDNN_HINT} /usr /usr/local
PATH_SUFFIXES include)
if (ZDNN_INCLUDE)
message(STATUS "zdnn: found include: ${ZDNN_INCLUDE}")
else()
message(FATAL_ERROR "zdnn: include directory not found, please set ZDNN_ROOT to the proper path if necessary")
endif()
find_library(ZDNN_LIB
NAMES zdnn
HINTS ${ZDNN_HINT} /usr /usr/local
PATH_SUFFIXES lib lib64)
if (ZDNN_LIB)
message(STATUS "zdnn: found library: ${ZDNN_LIB}")
else()
message(FATAL_ERROR "zdnn: library not found, please set ZDNN_ROOT to the proper path if necessary")
endif()
file(GLOB GGML_SOURCES_ZDNN "*.c" "*.cpp")
file(GLOB GGML_HEADERS_ZDNN "*.h" "*.hpp")
ggml_add_backend_library(ggml-zdnn ${GGML_HEADERS_ZDNN} ${GGML_SOURCES_ZDNN})
target_link_libraries(ggml-zdnn PRIVATE ${ZDNN_LIB})
target_include_directories(ggml-zdnn PRIVATE ${ZDNN_INCLUDE})
target_link_directories(ggml-zdnn PRIVATE ${ZDNN_LIB})
target_compile_definitions(ggml-zdnn PRIVATE GGML_USE_ZDNN)
+97
View File
@@ -0,0 +1,97 @@
#ifndef GGML_ZDNN_IMPL
#define GGML_ZDNN_IMPL
#include "zdnn.h"
#include "ggml.h"
#include "ggml-zdnn.h"
#include <vector>
#include <memory>
#include <vecintrin.h>
#define GGML_ZDNN_NAME "zDNN"
#define GGML_ZDNN_VERSION ZDNN_VERNUM
#define vec_neg(a) (-(a)) // Vector Negate
#define vec_add(a, b) ((a) + (b)) // Vector Add
#define vec_sub(a, b) ((a) - (b)) // Vector Subtract
#define vec_mul(a, b) ((a) * (b)) // Vector Multiply
#define vec_div(a, b) ((a) / (b)) // Vector Divide
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
#define vec_sra(a, b) ((a) >> (b)) // Vector Shift Right
#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
#ifndef vec_and
#define vec_and(a, b) ((a) & (b)) // Vector AND
#endif
#ifndef vec_or
#define vec_or(a, b) ((a) | (b)) // Vector OR
#endif
#ifndef vec_xor
#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
#endif
typedef signed char char8x16_t __attribute__((vector_size(16)));
typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
typedef int8_t int8x16_t __attribute__((vector_size(16)));
typedef int16_t int16x8_t __attribute__((vector_size(16)));
typedef int32_t int32x4_t __attribute__((vector_size(16)));
typedef uint8_t uint8x16_t __attribute__((vector_size(16)));
typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
typedef float float32x4_t __attribute__((vector_size(16)));
typedef double double64x2_t __attribute__((vector_size(16)));
typedef signed long long long64x2_t __attribute__((vector_size(16)));
typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
#define ZDNN_CHECK(stmt) \
do { \
zdnn_status status = (stmt); \
GGML_ASSERT(status == ZDNN_OK); \
} while (0);
struct ggml_backend_zdnn_device_context {
int zdnn_device;
int zdnn_device_ref_count;
bool has_parmblkformat_0;
bool has_parmblkformat_1;
size_t max_size;
char name[128];
};
struct ggml_backend_zdnn_context {
int device;
ggml_cgraph * gf;
};
struct ggml_backend_zdnn_buffer {
void * data;
size_t size;
zdnn_tensor_desc pre_tfm_desc;
zdnn_tensor_desc tfm_desc;
zdnn_ztensor ztensor;
char name[GGML_MAX_NAME];
};
struct ggml_backend_zdnn_buffer_context {
void * all_data;
size_t all_size;
bool owned;
int n_buffers;
std::vector<std::unique_ptr<ggml_backend_zdnn_buffer>> buffers;
};
#endif // GGML_ZDNN_IMPL
+846
View File
@@ -0,0 +1,846 @@
#include "zdnn.h"
#include "ggml-zdnn.h"
#include "ggml-zdnn-impl.h"
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include <vector>
#include <memory>
#include <csignal>
#include <unistd.h>
inline zdnn_data_types ggml_zdnn_type_mapping(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return FP32;
case GGML_TYPE_F16:
return FP16;
case GGML_TYPE_BF16:
return BFLOAT;
case GGML_TYPE_I8:
return INT8;
case GGML_TYPE_I32:
return INT32;
case GGML_TYPE_Q8_0:
return INT8;
default:
GGML_ABORT("%s: fatal: unable to determine zTensor data type",
__func__);
break;
}
}
inline void ggml_zdnn_create_tensor(zdnn_tensor_desc & pre_tfm_desc,
zdnn_tensor_desc & tfm_desc,
zdnn_ztensor & ztensor,
const ggml_tensor * src,
const int64_t * ne,
const zdnn_data_layouts layout) {
zdnn_init_pre_transformed_desc(
layout,
ggml_zdnn_type_mapping(src->type),
&pre_tfm_desc,
ne[3], ne[2], ne[1], ne[0]
);
ZDNN_CHECK(zdnn_generate_transformed_desc(&pre_tfm_desc, &tfm_desc));
ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&pre_tfm_desc, &tfm_desc, &ztensor));
}
inline void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor,
void * buffer) {
ZDNN_CHECK(zdnn_transform_ztensor(&ztensor, buffer));
}
inline void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor) {
switch (tensor->op) {
case GGML_OP_MUL_MAT:
{
zdnn_init_pre_transformed_desc(
ZDNN_2D,
ggml_zdnn_type_mapping(tensor->type),
&buffer->pre_tfm_desc,
tensor->ne[1], tensor->ne[0]
);
} break;
default:
{
// For 4D tensors, GGML uses NCHW layout. However, because zDNN
// automatically transforms everything to NHWC, we will use it
// directly to avoid the performance penalty changing the
// layout and reshaping the tensor.
zdnn_init_pre_transformed_desc(
ZDNN_NHWC,
ggml_zdnn_type_mapping(tensor->type),
&buffer->pre_tfm_desc,
tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]
);
// TODO: Consider adding a ggml check.
// TODO: If tensor = 4D, use ZDNN_NCHW by default.
// TODO: If tensor = 2D, use ZDNN_NHWC by default.
} break;
}
ZDNN_CHECK(zdnn_generate_transformed_desc(&buffer->pre_tfm_desc, &buffer->tfm_desc));
ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&buffer->pre_tfm_desc, &buffer->tfm_desc, &buffer->ztensor));
}
static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_TENSOR_BINARY_OP_LOCALS;
const enum ggml_type type = src0->type;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
const ggml_tensor * weights = src0;
const ggml_tensor * inputs = src1;
ggml_tensor * output = dst;
ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra;
ggml_backend_zdnn_buffer * inputs_extra = (ggml_backend_zdnn_buffer *)inputs->extra;
ggml_backend_zdnn_buffer * output_extra = (ggml_backend_zdnn_buffer *)output->extra;
zdnn_tensor_desc ptd_bias, td_bias;
zdnn_ztensor zt_bias;
const int64_t weights_rows = ne01;
const int64_t weights_cols = ne00;
const int64_t inputs_rows = ne11;
const int64_t inputs_cols = ne10;
assert(inputs_cols == weights_cols);
const int64_t output_rows = ne1;
const int64_t output_cols = ne0;
const int64_t bias_dim [GGML_MAX_DIMS] = { 1, 1, 1, output_cols };
ggml_zdnn_create_tensor(ptd_bias, td_bias, zt_bias, output, bias_dim, ZDNN_1D);
void * bias_data = (void *)calloc(ne0, ggml_element_size(output));
if (weights_extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(weights_extra->ztensor, weights->data);
if (inputs_extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(inputs_extra->ztensor, inputs->data);
ggml_zdnn_load_tensor(zt_bias, bias_data);
// GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
// __func__, weights_extra->name,
// weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0],
// weights_extra->pre_tfm_desc.dim1,
// weights_extra->pre_tfm_desc.dim2,
// weights_extra->pre_tfm_desc.dim3,
// weights_extra->pre_tfm_desc.dim4);
// GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
// __func__, inputs_extra->name,
// inputs->ne[3], inputs->ne[2], inputs->ne[1], inputs->ne[0],
// inputs_extra->pre_tfm_desc.dim1,
// inputs_extra->pre_tfm_desc.dim2,
// inputs_extra->pre_tfm_desc.dim3,
// inputs_extra->pre_tfm_desc.dim4);
GGML_ASSERT(weights_extra->pre_tfm_desc.dim1 == weights->ne[0] && "weights_extra->pre_tfm_desc.dim1 must match weights->ne[0]");
GGML_ASSERT(weights_extra->pre_tfm_desc.dim2 == weights->ne[1] && "weights_extra->pre_tfm_desc.dim2 must match weights->ne[1]");
GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1 == inputs->ne[0] && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]");
GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2 == inputs->ne[1] && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]");
ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &zt_bias,
false, true, MATMUL_OP_ADDITION, &output_extra->ztensor));
// TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient.
ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data));
ZDNN_CHECK(zdnn_free_ztensor_buffer(&zt_bias));
free(bias_data);
}
static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
bool use_mul_mat_vec =
(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q =
ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
bool use_mul_mat_q =
ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
// debug helpers
// GGML_LOG_INFO("%s: use_mul_mat_vec = %d\n", __func__, use_mul_mat_vec);
// GGML_LOG_INFO("%s: use_mul_mat_vec_q = %d\n", __func__, use_mul_mat_vec_q);
// GGML_LOG_INFO("%s: use_mul_mat_q = %d\n", __func__, use_mul_mat_q);
// GGML_LOG_INFO("%s: src0: %8d %8d %8d %8d\n", __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
// GGML_LOG_INFO("%s: %8d %8d %8d %8d\n", __func__, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
// GGML_LOG_INFO("%s: src1: %8d %8d %8d %8d\n", __func__, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
// GGML_LOG_INFO("%s: %8d %8d %8d %8d\n", __func__, src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
// GGML_LOG_INFO("%s: src0 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
// GGML_LOG_INFO("%s: src1 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1)
&& src1->ne[2] * src1->ne[3] > 1) {
// general KQ + KQV multi-batch
GGML_LOG_INFO("%s: using zdnn_mul_mat_batched for KQ + KQV multi-batch\n", __func__);
// ggml_zdnn_mul_mat_batched(ctx, src0, src1, dst);
} else if (use_mul_mat_vec) {
GGML_LOG_INFO("%s: using zdnn_op_mul_mat_vec for vector multiplication\n", __func__);
// ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_vec, nullptr);
} else if (use_mul_mat_vec_q) {
GGML_LOG_INFO("%s: using zdnn_op_mul_mat_vec_q for quantized vector multiplication\n", __func__);
// ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_vec_q, ggml_zdnn_quantize_row_q8_1);
} else if (use_mul_mat_q) {
GGML_LOG_INFO("%s: using zdnn_op_mul_mat_q for quantized matrix multiplication\n", __func__);
// ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_q, ggml_zdnn_quantize_mmq_q8_1);
} else {
// GGML_LOG_INFO("%s: using zdnn_op_mul_mat for general matrix multiplication\n", __func__);
ggml_zdnn_mul_mat_op(ctx, src0, src1, dst);
}
}
static bool ggml_zdnn_compute_forward(ggml_backend_zdnn_context * ctx, ggml_tensor * dst) {
switch (dst->op) {
case GGML_OP_MUL_MAT:
ggml_zdnn_mul_mat_dispatch(ctx, dst->src[0], dst->src[1], dst);
break;
default:
return false;
}
return true;
}
static enum ggml_status ggml_zdnn_graph_compute(ggml_backend_t backend, ggml_cgraph * gf) {
ggml_backend_zdnn_context * ctx = ( ggml_backend_zdnn_context *)backend->context;
ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)backend->device->context;
ctx->gf = gf;
for (int i = 0; i < gf->n_nodes; i++) {
ggml_tensor * node = gf->nodes[i];
if (ggml_is_empty(node)
|| node->op == GGML_OP_NONE
|| node->op == GGML_OP_RESHAPE
|| node->op == GGML_OP_VIEW
|| node->op == GGML_OP_PERMUTE
|| node->op == GGML_OP_TRANSPOSE) {
continue;
}
bool ok = ggml_zdnn_compute_forward(ctx, node);
if (!ok) {
GGML_LOG_ERROR("%s: unsupported op %s (%s)\n",
__func__, node->name, ggml_op_name(node->op));
}
GGML_ASSERT(ok);
}
return GGML_STATUS_SUCCESS;
}
static bool ggml_zdnn_supports_op(const ggml_backend_zdnn_device_context * ctx_dev, const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_TRANSPOSE:
case GGML_OP_PERMUTE:
return true;
case GGML_OP_MUL_MAT:
{
const ggml_tensor * src0 = op->src[0];
const ggml_tensor * src1 = op->src[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = op->ne[0];
const int64_t ne1 = op->ne[1];
const int64_t max_batch = ctx_dev->max_size;
return ggml_is_matrix(src0) &&
ggml_is_matrix(src1) &&
ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
src0->view_src == nullptr && src1->view_src == nullptr &&
src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 &&
(ne0 <= max_batch && ne1 <= max_batch && ne10 <= max_batch);
} break;
default:
return false;
}
}
////////////////////////////////////////////////////////////////////////////////
//
// globals
//
// initialised in ggml_backend_zdnn_reg
static ggml_backend_reg g_ggml_backend_zdnn_reg;
static ggml_backend_device g_ggml_backend_zdnn_device;
static ggml_backend_zdnn_device_context g_ggml_ctx_dev_main = {
/* .zdnn_device = */ 0,
/* .zdnn_device_ref_count = */ 0,
/* .has_parmblkformat_0 = */ false,
/* .has_parmblkformat_1 = */ false,
/* .max_size = */ 0,
/* .name = */ "",
};
static int ggml_backend_zdnn_device_acq(ggml_backend_zdnn_device_context * ctx) {
assert(ctx != NULL);
if (ctx->zdnn_device == 0) {
ctx->zdnn_device = 1;
}
if (ctx->zdnn_device >= 1) {
ctx->has_parmblkformat_0 = zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_0);
ctx->has_parmblkformat_1 = zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_1);
ctx->max_size = zdnn_get_nnpa_max_dim_idx_size();
strncpy(ctx->name, GGML_ZDNN_NAME, sizeof(ctx->name) - 1);
}
ctx->zdnn_device_ref_count++;
return ctx->zdnn_device;
}
static void ggml_backend_zdnn_device_rel(ggml_backend_zdnn_device_context * ctx) {
assert(ctx != NULL);
assert(ctx->zdnn_device_ref_count > 0);
ctx->zdnn_device_ref_count--;
if (ctx->zdnn_device_ref_count == 0) {
if (ctx->zdnn_device >= 0) {
ctx->zdnn_device = 0;
}
}
}
static ggml_backend_zdnn_context * ggml_zdnn_init(ggml_backend_dev_t dev) {
GGML_LOG_INFO("%s: allocating\n", __func__);
GGML_LOG_INFO("%s: found 1 device\n", __func__);
#ifdef STATIC_LIB
zdnn_init();
#endif
ggml_backend_zdnn_context * ctx = new ggml_backend_zdnn_context();
ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)dev->context;
int device = 1;
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, ctx_dev->name);
ctx->device = device;
GGML_LOG_INFO("%s: NNPA name: %s\n", __func__, ctx_dev->name);
GGML_LOG_INFO("%s: NNPA_PARMBLKFORMAT_0 = %s\n", __func__, ctx_dev->has_parmblkformat_0 ? "true" : "false");
GGML_LOG_INFO("%s: NNPA_PARMBLKFORMAT_1 = %s\n", __func__, ctx_dev->has_parmblkformat_1 ? "true" : "false");
ctx->gf = nullptr;
return ctx;
}
static void ggml_zdnn_free(ggml_backend_zdnn_context * ctx) {
GGML_LOG_INFO("%s: deallocating\n", __func__);
delete ctx;
}
//
// backend interface
//
static void ggml_backend_zdnn_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_zdnn_buffer_context * ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;
for (int i = 0; i < ctx->n_buffers; i++) {
if (ctx->buffers[i]->ztensor.buffer != NULL && ctx->buffers[i]->ztensor.is_transformed) {
ZDNN_CHECK(zdnn_free_ztensor_buffer(&ctx->buffers[i]->ztensor));
}
}
delete ctx;
}
static void * ggml_backend_zdnn_buffer_get_base(ggml_backend_buffer_t buffer) {
ggml_backend_zdnn_buffer_context * ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;
return ctx->all_data;
}
static enum ggml_status ggml_backend_zdnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
if (tensor->view_src != NULL) {
assert(tensor->view_src->buffer->buft == buffer->buft);
return GGML_STATUS_SUCCESS;
}
ggml_backend_zdnn_buffer_context * ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;
const int64_t tsize = ggml_nbytes(tensor);
int buffer_idx = ctx->n_buffers;
std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
zdnn_buffer->data = tensor->data;
zdnn_buffer->size = tsize;
strncpy(zdnn_buffer->name, tensor->name, GGML_MAX_NAME - 1);
ggml_zdnn_init_tensor(zdnn_buffer.get(), tensor);
tensor->extra = zdnn_buffer.get();
ctx->buffers.push_back(std::move(zdnn_buffer));
ctx->n_buffers++;
// GGML_LOG_INFO("%s: initialised tensor '%s' in buffer %d, size = %8.2f MiB\n",
// __func__, tensor->name, buffer_idx, tsize);
return GGML_STATUS_SUCCESS;
}
static void ggml_backend_zdnn_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
memset((char *)tensor->data + offset, value, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_zdnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
memcpy((char *)tensor->data + offset, data, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_zdnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
memcpy(data, (const char *)tensor->data + offset, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_zdnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_zdnn_buffer_context * ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;
memset(ctx->all_data, value, ctx->all_size);
}
static ggml_backend_buffer_i ggml_backend_zdnn_buffer_i = {
/* .free_buffer = */ ggml_backend_zdnn_buffer_free_buffer,
/* .get_base = */ ggml_backend_zdnn_buffer_get_base,
/* .init_tensor = */ ggml_backend_zdnn_buffer_init_tensor,
/* .memset_tensor = */ ggml_backend_zdnn_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_zdnn_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_zdnn_buffer_get_tensor,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_zdnn_buffer_clear,
/* .reset = */ NULL,
};
//
// default buffer type
//
static const char * ggml_backend_zdnn_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return GGML_ZDNN_NAME;
GGML_UNUSED(buft);
}
static ggml_backend_buffer_t ggml_backend_zdnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
ggml_backend_zdnn_buffer_context * ctx = new ggml_backend_zdnn_buffer_context();
const size_t size_page = sysconf(_SC_PAGESIZE);
size_t size_aligned = size;
if ((size_aligned % size_page) != 0) {
size_aligned += size_page - (size_aligned % size_page);
}
ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)buft->device->context;
GGML_ASSERT(ctx_dev->zdnn_device >= 0);
int device = ctx_dev->zdnn_device; GGML_UNUSED(device);
ctx->all_data = ggml_aligned_malloc(size_aligned);
ctx->all_size = size_aligned;
ctx->owned = true;
ctx->n_buffers = 1;
if (ctx->all_data != NULL) {
std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
zdnn_buffer->data = ctx->all_data;
zdnn_buffer->size = size_aligned;
ctx->buffers.push_back(std::move(zdnn_buffer));
}
if (size_aligned > 0 && (ctx->all_data == NULL)) {
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f\n",
__func__, size_aligned / 1024.0 / 1024.0);
delete ctx;
return NULL;
}
return ggml_backend_buffer_init(buft, ggml_backend_zdnn_buffer_i, ctx, size);
}
static size_t ggml_backend_zdnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return 256;
GGML_UNUSED(buft);
}
static bool ggml_backend_zdnn_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return true;
GGML_UNUSED(buft);
}
ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void) {
static ggml_backend_buffer_type ggml_backend_buffer_type_zdnn = {
/* .iface = */ {
/* .get_name = */ ggml_backend_zdnn_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_zdnn_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_zdnn_buffer_type_get_alignment,
/* .get_max_size = */ NULL,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_zdnn_buffer_type_is_host,
},
/* .device = */ &g_ggml_backend_zdnn_device,
/* .context = */ NULL,
};
return &ggml_backend_buffer_type_zdnn;
}
static const char * ggml_backend_zdnn_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
return GGML_ZDNN_NAME "_Mapped";
GGML_UNUSED(buft);
}
static ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_from_ptr_type(void) {
static ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_zdnn = {
/* .iface = */ {
/* .get_name = */ ggml_backend_zdnn_buffer_from_ptr_type_get_name,
/* .alloc_buffer = */ ggml_backend_zdnn_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_zdnn_buffer_type_get_alignment,
/* .get_max_size = */ NULL,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_zdnn_buffer_type_is_host,
},
/* .device = */ &g_ggml_backend_zdnn_device,
/* .context = */ NULL,
};
return &ggml_backend_buffer_from_ptr_type_zdnn;
}
//
// backend
//
static const char * ggml_backend_zdnn_name(ggml_backend_t backend) {
return GGML_ZDNN_NAME;
GGML_UNUSED(backend);
}
static void ggml_backend_zdnn_free(ggml_backend_t backend) {
ggml_backend_zdnn_context * ctx = (ggml_backend_zdnn_context *)backend->context;
ggml_zdnn_free(ctx);
free(backend);
}
static enum ggml_status ggml_backend_zdnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
return ggml_zdnn_graph_compute(backend, cgraph);
}
static ggml_backend_i ggml_backend_zdnn_i = {
/* .get_name = */ ggml_backend_zdnn_name,
/* .free = */ ggml_backend_zdnn_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_zdnn_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
static ggml_guid_t ggml_backend_zdnn_guid(void) {
static const char * guid_str = "IBM-ZDNN-ACCELER";
return reinterpret_cast<ggml_guid_t>((void *)guid_str);
}
// TODO: remove in the future
ggml_backend_t ggml_backend_zdnn_init(void) {
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_zdnn_reg(), 0);
ggml_backend_zdnn_context * ctx = ggml_zdnn_init(dev);
if (ctx == NULL) {
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return NULL;
}
ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(ggml_backend));
*backend = (ggml_backend) {
/* .guid = */ ggml_backend_zdnn_guid(),
/* .iface = */ ggml_backend_zdnn_i,
/* .device = */ dev,
/* .context = */ ctx,
};
return backend;
}
bool ggml_backend_is_zdnn(ggml_backend_t backend) {
return backend != NULL &&
ggml_guid_matches(backend->guid, ggml_backend_zdnn_guid());
GGML_UNUSED(backend);
}
//
// backend device
//
static const char * ggml_backend_zdnn_device_get_name(ggml_backend_dev_t dev) {
return GGML_ZDNN_NAME;
GGML_UNUSED(dev);
}
static const char * ggml_backend_zdnn_device_get_description(ggml_backend_dev_t dev) {
return "IBM Z Neural Network Processing Assist (NNPA)";
}
static void ggml_backend_zdnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
*free = 0;
*total = 0;
}
static enum ggml_backend_dev_type ggml_backend_zdnn_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_ACCEL;
GGML_UNUSED(dev);
}
static void ggml_backend_zdnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_zdnn_device_get_name(dev);
props->description = ggml_backend_zdnn_device_get_description(dev);
props->type = ggml_backend_zdnn_device_get_type(dev);
ggml_backend_zdnn_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (ggml_backend_dev_caps) {
/* .async = */ false,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ true,
/* .events = */ false,
};
}
static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, const char * params) {
ggml_backend_zdnn_context * ctx = ggml_zdnn_init(dev);
if (ctx == NULL) {
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return NULL;
}
ggml_backend_t backend = (ggml_backend *)malloc(sizeof(ggml_backend));
*backend = (ggml_backend) {
/* .guid = */ ggml_backend_zdnn_guid(),
/* .iface = */ ggml_backend_zdnn_i,
/* .device = */ dev,
/* .context = */ ctx,
};
return backend;
GGML_UNUSED(params);
}
static ggml_backend_buffer_type_t ggml_backend_zdnn_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_zdnn_buffer_type();
GGML_UNUSED(dev);
}
static ggml_backend_buffer_t ggml_backend_zdnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
ggml_backend_zdnn_buffer_context * ctx = new ggml_backend_zdnn_buffer_context();
ctx->all_data = ptr;
ctx->all_size = size;
ctx->owned = false;
ctx->n_buffers = 0;
const size_t size_page = sysconf(_SC_PAGESIZE);
// page-align the data ptr
{
const uintptr_t offs = (uintptr_t) ptr % size_page;
ptr = (void *)((char *)ptr - offs);
size += offs;
}
size_t size_aligned = size;
if ((size_aligned % size_page) != 0) {
size_aligned += size_page - (size_aligned % size_page);
}
ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)dev->context;
GGML_ASSERT(ctx_dev->zdnn_device >= 0);
int device = ctx_dev->zdnn_device; GGML_UNUSED(device);
std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
zdnn_buffer->data = ptr;
zdnn_buffer->size = size;
ctx->buffers.push_back(std::move(zdnn_buffer));
GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB\n",
__func__, size_aligned / 1024.0 / 1024.0);
++ctx->n_buffers;
return ggml_backend_buffer_init(ggml_backend_zdnn_buffer_from_ptr_type(), ggml_backend_zdnn_buffer_i, ctx, size);
}
static bool ggml_backend_zdnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *) dev->context;
return ggml_zdnn_supports_op(ctx_dev, op);
}
static bool ggml_backend_zdnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return
buft->iface.get_name == ggml_backend_zdnn_buffer_type_get_name ||
buft->iface.get_name == ggml_backend_zdnn_buffer_from_ptr_type_get_name;
GGML_UNUSED(dev);
}
static ggml_backend_device_i ggml_backend_zdnn_device_i = {
/* .get_name = */ ggml_backend_zdnn_device_get_name,
/* .get_description = */ ggml_backend_zdnn_device_get_description,
/* .get_memory = */ ggml_backend_zdnn_device_get_memory,
/* .get_type = */ ggml_backend_zdnn_device_get_type,
/* .get_props = */ ggml_backend_zdnn_device_get_props,
/* .init_backend = */ ggml_backend_zdnn_device_init,
/* .get_buffer_type = */ ggml_backend_zdnn_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_zdnn_device_buffer_from_ptr,
/* .supports_op = */ ggml_backend_zdnn_device_supports_op,
/* .supports_buft = */ ggml_backend_zdnn_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
//
// backend registry
//
static const char * ggml_backend_zdnn_reg_get_name(ggml_backend_reg_t reg) {
return GGML_ZDNN_NAME;
GGML_UNUSED(reg);
}
static size_t ggml_backend_zdnn_reg_device_count(ggml_backend_reg_t reg) {
if (!zdnn_is_nnpa_installed()) {
return 0;
}
return 1;
GGML_UNUSED(reg);
}
static ggml_backend_dev_t ggml_backend_zdnn_reg_device_get(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index == 0);
return &g_ggml_backend_zdnn_device;
GGML_UNUSED(reg);
GGML_UNUSED(index);
}
static ggml_backend_feature g_ggml_backend_zdnn_features[] = {
{ "NNPA", zdnn_is_nnpa_installed() ? "1" : "0" },
{ "NNPA_PARMBLKFORMAT_0", zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_0) ? "1" : "0" },
{ "NNPA_PARMBLKFORMAT_1", zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_1) ? "1" : "0" },
{ NULL, NULL },
};
static ggml_backend_feature * ggml_backend_zdnn_get_features(ggml_backend_reg_t reg) {
return g_ggml_backend_zdnn_features;
GGML_UNUSED(reg);
}
static void * ggml_backend_zdnn_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *) ggml_backend_zdnn_get_features;
}
return NULL;
GGML_UNUSED(reg);
}
static ggml_backend_reg_i ggml_backend_zdnn_reg_i = {
/* .get_name = */ ggml_backend_zdnn_reg_get_name,
/* .get_device_count = */ ggml_backend_zdnn_reg_device_count,
/* .get_device = */ ggml_backend_zdnn_reg_device_get,
/* .get_proc_address = */ ggml_backend_zdnn_get_proc_address,
};
static void ggml_zdnn_cleanup(void) {
ggml_backend_zdnn_device_rel(&g_ggml_ctx_dev_main);
}
// TODO: make thread-safe
ggml_backend_reg_t ggml_backend_zdnn_reg(void) {
ggml_backend_zdnn_device_acq(&g_ggml_ctx_dev_main);
// register cleanup callback
atexit(ggml_zdnn_cleanup);
{
g_ggml_backend_zdnn_reg = (ggml_backend_reg) {
/* .api_version = */ GGML_ZDNN_VERSION,
/* .iface = */ ggml_backend_zdnn_reg_i,
/* .context = */ NULL,
};
g_ggml_backend_zdnn_device = (ggml_backend_device) {
/* .iface = */ ggml_backend_zdnn_device_i,
/* .reg = */ &g_ggml_backend_zdnn_reg,
/* .context = */ &g_ggml_ctx_dev_main,
};
return &g_ggml_backend_zdnn_reg;
}
}
GGML_BACKEND_DL_IMPL(ggml_backend_zdnn_reg)
+1
View File
@@ -5528,6 +5528,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 5000, 1, 1}));
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 1, 1, 1}));
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 513, 1, 1}));
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {100, 10, 1, 1}));
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 12, 1, 1}));