Compare commits

..

14 Commits

Author SHA1 Message Date
o7si d4c8e2c29c vocab : add tokenizer support for jina-embeddings-v2-base-zh (#18756)
* vocab : add jina-embeddings-v2-base-zh (whitespace tokenizer)

* lowercase defaults to true

* type fix

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-05-31 12:37:35 +02:00
Eric Zhang 3292da09f6 ui: fix ETag truncation with MSVC compiler (#23917) 2026-05-31 11:21:23 +02:00
Vladislav e6123e2080 docs : update ZenDNN docs for Q8 support (#23791)
* docs zendnn added information about Q8 support

* docs zendnn rm unnecessary data

* docs update, links to ZenDNN docs provided

* docs zenDNN update: clarified explanation

* docs zenDNN update: one more explanation clarified

---------

Co-authored-by: plotnikov.v10 <plotnikov.v10@wb.ru>
2026-05-31 10:26:42 +02:00
Ruben Ortlam 22cadc1944 llama: only use one iGPU device by default (#23897) 2026-05-31 08:17:47 +02:00
Pascal d749821db3 webui: add custom CSS injection via config (#23904)
* webui: add custom CSS injection via config

register a customCSS setting in the Developer section under Custom JSON,
syncable so it rides the existing ui-config pass through. inject the value
into a single style element in the head, reactive on the setting. lets an
operator theme a prebuilt binary through --ui-config without rebuilding,
and lets a user set it from the settings panel.

* ui: address review from @niutech and @allozaur, rename custom JSON key and CSS field

* ui: address review from @allozaur, move custom CSS injection to a style tag in svelte:head

* ui: inject custom CSS through a svelte action instead of a bound element

move the textContent write into a use: action on the head style node.
the action is the idiomatic way to touch a node, so the no-dom-manipulating
lint rule is satisfied without a disable. value stays text through
textContent, never parsed as HTML.

* Update tools/ui/src/lib/constants/settings-keys.ts

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>

* ui: address review from @allozaur, rename custom config key to customJson with migration

rename the custom config key to customJson across the type, the chat
request builder, the settings save check and the custom tools reader,
keeping the custom API param name unchanged. add a non destructive
migration that copies the legacy custom key to customJson at startup.
only render the head style tag when custom CSS is set.

---------

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
2026-05-30 23:49:31 +02:00
Gaurav Garg aa46bda89b Support -fa auto in llama-bench (#23714)
* Support `-fa auto` in llama-bench

Make the default value of `-ngl` -1, similar to other tools.

Update README with latest usage and examples

* Address review comments
2026-05-31 02:03:57 +05:30
lhez d6588daa80 opencl: support bf16 by converting to f16 (#23839) 2026-05-30 10:17:47 -07:00
Pascal d38d50e7ff ui: exclude generated build dirs from prettier and eslint so lint errors stop being masked (#23910) 2026-05-30 16:50:54 +02:00
Johannes Gäßler 8b0e0db606 TP: fix granularity for Qwen 3.5/3.6 + 3 GPUs (#23843)
* TP: fix granularity for Qwen 3.5/3.6 + 3 GPUs

* fix afmoe TP
2026-05-30 16:48:00 +03:00
Georgi Gerganov 2d9b7c8e98 metal : restore im2col implementation for large kernels (#23901) 2026-05-30 15:26:13 +03:00
Xuan-Son Nguyen e674b1279b test: (test-llama-archs) log the config name first (#23885) 2026-05-30 12:22:38 +02:00
Georgi Gerganov 4c4e91b799 ci : update ios-xcode release job to macos-26 (#23906)
* ci : disable libcommon build from xcframework

* ocd : fix name

* ci : ios-xcode change to macos-26

* cont : pin xcode

* cont : pin xcode to minor version
2026-05-30 13:21:46 +03:00
Jinyang He d48a56effb ggml : add some lsx support (#23798)
* loongarch : optimize LSX fp16 load/store with native intrinsics

Use __lsx_vfcvtl_s_h and __lsx_vfcvt_h_s instead of scalar loops in
__lsx_f16x4_load and __lsx_f16x4_store.

* loongarch : add LSX implementation for q8_0 dot product

* loongarch : add LSX implementation for q6_K dot product

* loongarch : add LSX implementation for iq4_xs dot product

* Improve reduce ops when sun int16 pairs to int32
2026-05-30 11:53:26 +03:00
Ruben Ortlam 6e093b80ea vulkan: add Flash Attention support for BFloat16 KV cache (#23420)
* vulkan: add flash attention bf16 kv support

* vulkan: bf16 FA coopmat1 support

* vulkan: bf16 FA coopmat2 support

* fix FA bf16 f32 fallback

* fix FA bf16 coopmat1 shader

* fix FA bf16 coopmat2 shader

* code cleanup

* cleanup comment change

* address feedback

* add O_TYPE for cm2 FA

* use O_TYPE for gqaStore function

* reduce BFLOAT16 ifdefs
2026-05-30 10:39:31 +02:00
43 changed files with 948 additions and 278 deletions
+28 -30
View File
@@ -38,7 +38,7 @@ concurrency:
queue: max
jobs:
check_release:
check-release:
runs-on: ubuntu-slim
outputs:
@@ -60,8 +60,8 @@ jobs:
fi
macos-cpu:
needs: [check_release]
if: ${{ needs.check_release.outputs.should_release == 'true' }}
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
strategy:
matrix:
include:
@@ -141,8 +141,8 @@ jobs:
name: llama-bin-macos-${{ matrix.build }}.tar.gz
ubuntu-cpu:
needs: [check_release]
if: ${{ needs.check_release.outputs.should_release == 'true' }}
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
strategy:
matrix:
include:
@@ -227,8 +227,8 @@ jobs:
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
ubuntu-vulkan:
needs: [check_release]
if: ${{ needs.check_release.outputs.should_release == 'true' }}
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
strategy:
matrix:
@@ -312,8 +312,8 @@ jobs:
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
android-arm64:
needs: [check_release]
if: ${{ needs.check_release.outputs.should_release == 'true' }}
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: ubuntu-latest
@@ -404,8 +404,8 @@ jobs:
name: llama-bin-android-arm64.tar.gz
ubuntu-24-openvino:
needs: [check_release]
if: ${{ needs.check_release.outputs.should_release == 'true' }}
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: ubuntu-24.04
@@ -501,8 +501,8 @@ jobs:
name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
windows-cpu:
needs: [check_release]
if: ${{ needs.check_release.outputs.should_release == 'true' }}
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: windows-2025
@@ -569,8 +569,8 @@ jobs:
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
windows:
needs: [check_release]
if: ${{ needs.check_release.outputs.should_release == 'true' }}
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: windows-2025
@@ -667,8 +667,8 @@ jobs:
name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
windows-cuda:
needs: [check_release]
if: ${{ needs.check_release.outputs.should_release == 'true' }}
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: windows-2022
@@ -959,8 +959,8 @@ jobs:
# name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
ubuntu-22-rocm:
needs: [check_release]
if: ${{ needs.check_release.outputs.should_release == 'true' }}
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: ubuntu-22.04
@@ -1079,8 +1079,8 @@ jobs:
name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
windows-hip:
needs: [check_release]
if: ${{ needs.check_release.outputs.should_release == 'true' }}
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: windows-2022
@@ -1202,11 +1202,9 @@ jobs:
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
ios-xcode:
needs: [check_release]
if: ${{ needs.check_release.outputs.should_release == 'true' }}
# TODO: figure out how to make this work with macos-26
# https://github.com/ggml-org/llama.cpp/actions/runs/26652714555/job/78604869474
runs-on: macos-15
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: macos-26
steps:
- name: Checkout code
@@ -1216,7 +1214,7 @@ jobs:
- name: Setup Xcode
run: |
sudo xcode-select -s /Applications/Xcode_16.4.app
sudo xcode-select -s /Applications/Xcode_26.4.app
- name: Build
id: cmake_build
@@ -1232,7 +1230,7 @@ jobs:
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_OSX_DEPLOYMENT_TARGET=16.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
@@ -1354,8 +1352,8 @@ jobs:
# name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
ui:
needs: [check_release]
if: ${{ needs.check_release.outputs.should_release == 'true' }}
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
uses: ./.github/workflows/ui-build.yml
release:
+2
View File
@@ -8,6 +8,7 @@ TVOS_MIN_OS_VERSION=16.4
BUILD_SHARED_LIBS=OFF
LLAMA_BUILD_APP=OFF
LLAMA_BUILD_COMMON=OFF
LLAMA_BUILD_EXAMPLES=OFF
LLAMA_BUILD_TOOLS=OFF
LLAMA_BUILD_TESTS=OFF
@@ -33,6 +34,7 @@ COMMON_CMAKE_ARGS=(
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
-DLLAMA_BUILD_APP=${LLAMA_BUILD_APP}
-DLLAMA_BUILD_COMMON=${LLAMA_BUILD_COMMON}
-DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
+10
View File
@@ -1692,6 +1692,16 @@ class TextModel(ModelBase):
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_whitespace(self) -> None:
tokens, toktypes, _ = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("whitespace")
self.gguf_writer.add_tokenizer_pre("whitespace") # pinned, not hash-detected: chktxt hash collides with jina-v1-en
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_hybriddna(self):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+10 -1
View File
@@ -571,7 +571,16 @@ class JinaBertV2Model(BertModel):
if tokenizer_class == 'BertTokenizer':
super().set_vocab()
elif tokenizer_class == 'RobertaTokenizer':
self._set_vocab_gpt2()
pre_tokenizer_type = None
tokenizer_json_path = self.dir_model / "tokenizer.json"
if tokenizer_json_path.is_file():
with open(tokenizer_json_path, "r", encoding="utf-8") as f:
pre_tokenizer_type = json.load(f).get("pre_tokenizer", {}).get("type")
if pre_tokenizer_type == "Whitespace":
self._set_vocab_whitespace()
else:
self._set_vocab_gpt2()
self.gguf_writer.add_token_type_count(2)
else:
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
+18 -1
View File
@@ -72,10 +72,13 @@ The ZenDNN backend accelerates **matrix multiplication (MUL_MAT)** and **expert-
|:----------------------:|:-------:|:---------------------------------------------:|
| FP32 | Support | Full precision floating point |
| BF16 | Support | BFloat16 (best performance on Zen 4/Zen 5) |
| Q8_0 | Support | 8-bit quantized weights via [dynamic quantization](https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_matmul_operator.md) |
*Notes:*
- **BF16** provides best performance on Zen 4 and Zen 5 EPYC™ processors (Genoa, Turin).
- **Q8_0** is available for quantized model weights since ZenDNN supports dynamic quantization [LowOHA MatMul operator](https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_matmul_operator.md).
- Other quantization formats fall back to the standard CPU backend unless explicitly supported by the ZenDNN backend.
## Linux
@@ -140,6 +143,15 @@ Download LLaMA 3.1 8B Instruct BF16 model:
huggingface-cli download meta-llama/Llama-3.1-8B-Instruct-GGUF --local-dir models/
```
You can also use a Q8_0 GGUF model:
```sh
# Download a Q8_0 GGUF model from Hugging Face
huggingface-cli download meta-llama/Llama-3.1-8B-Instruct-GGUF \
Llama-3.1-8B-Instruct-Q8_0.gguf \
--local-dir models/
```
#### 2. Start Server
Run llama.cpp server with ZenDNN acceleration:
@@ -176,6 +188,10 @@ export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo (recommended)
For more details on available algorithms, see the [ZenDNN MatMul Algorithm Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md#algorithm-details).
### Q8_0 Performance Notes
Q8_0 support is mainly beneficial for prompt processing / prefill workloads where large matrix multiplications dominate execution. Token generation performance may remain close to the standard CPU backend depending on the model, batch size, number of threads, and CPU topology.
### Profiling and Debugging
For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/logging.md).
@@ -184,6 +200,7 @@ For detailed profiling and logging options, refer to the [ZenDNN Logging Documen
- **Limited operation support**: Currently matrix multiplication (MUL_MAT) and expert-based matrix multiplication (MUL_MAT_ID) are accelerated via ZenDNN. Other operations fall back to the standard CPU backend. Future updates may expand supported operations.
- **BF16 support**: BF16 operations require AMD Zen 4 or Zen 5 architecture (EPYC 9004/9005 series). On older CPUs, operations will use FP32.
- **Q8_0 support scope**: Q8_0 acceleration is available for supported matrix multiplication paths. Other quantization formats still fall back to the standard CPU backend.
- **NUMA awareness**: For multi-socket systems, manual NUMA binding may be required for optimal performance.
## Q&A
@@ -202,7 +219,7 @@ A: ZenDNN is optimized specifically for AMD processors. While it may work on oth
**Q: Does ZenDNN support quantized models?**
A: Currently, ZenDNN primarily supports FP32 and BF16 data types. Quantized model support is not available at this time.
A: Yes. The ZenDNN backend supports Q8_0 quantized models for supported matrix multiplication operations. FP32 and BF16 are also supported. Other quantization formats may fall back to the standard CPU backend unless explicitly supported by the ZenDNN backend.
**Q: Why is my inference not faster with ZenDNN?**
+1
View File
@@ -22,6 +22,7 @@ The following sections describe how to build with different backends and options
* [HIP](#hip)
* [Vulkan](#vulkan)
* [CANN](#cann)
* [ZenDNN](#zendnn)
* [Arm® KleidiAI™](#arm-kleidiai)
* [OpenCL](#opencl)
* [Android](#android-1)
+151
View File
@@ -977,6 +977,35 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumf = hsum_float_8(acc);
*s = sumf;
#elif defined(__loongarch_sx)
__m128 acc = (__m128)__lsx_vldi(0);
for (; ib < nb; ++ib) {
const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
const __m128i qx_0 = __lsx_vld((const __m128i *)x[ib].qs, 0);
const __m128i qx_1 = __lsx_vld((const __m128i *)x[ib].qs + 1, 0);
const __m128i qy_0 = __lsx_vld((const __m128i *)y[ib].qs, 0);
const __m128i qy_1 = __lsx_vld((const __m128i *)y[ib].qs + 1, 0);
const __m128i p16_0 = lsx_maddubs_h(qx_0, qy_0);
const __m128i p16_1 = lsx_maddubs_h(qx_1, qy_1);
// Sum int16 pairs → int32
const __m128i s_0 = __lsx_vaddwev_w_h(p16_0, p16_1);
const __m128i s_1 = __lsx_vaddwod_w_h(p16_0, p16_1);
const __m128 q = __lsx_vffint_s_w(__lsx_vadd_w(s_0, s_1));
acc = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(d), q, acc);
}
__m128 res = lsx_hadd_s(acc, acc);
res = lsx_hadd_s(res, res);
sumf = ((v4f32)res)[0];
*s = sumf;
#else
UNUSED(nb);
UNUSED(ib);
@@ -1443,6 +1472,99 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = hsum_float_8(acc);
#elif defined(__loongarch_sx)
const __m128i m32s = __lsx_vreplgr2vr_b(32);
__m128 acc_0 = (__m128)__lsx_vldi(0);
__m128 acc_1 = (__m128)__lsx_vldi(0);
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const __m128i scale_i8 = __lsx_vld(x[i].scales, 0);
const __m128i scales_lo = __lsx_vsllwil_h_b(scale_i8, 0);
const __m128i scales_hi = __lsx_vsllwil_h_b(__lsx_vbsrl_v(scale_i8, 8), 0);
__m128i sumi_0 = __lsx_vldi(0);
__m128i sumi_1 = __lsx_vldi(0);
for (int j = 0; j < QK_K/128; ++j) {
const __m128i q4bitsH_0 = __lsx_vld((const __m128i*)qh, 0); qh += 16;
const __m128i q4bitsH_1 = __lsx_vld((const __m128i*)qh, 0); qh += 16;
const __m128i q4h_0 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_0, 3), 4);
const __m128i q4h_1 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_1, 3), 4);
const __m128i q4h_2 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_0, 3 << 2), 2);
const __m128i q4h_3 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_1, 3 << 2), 2);
const __m128i q4h_4 = __lsx_vandi_b(q4bitsH_0, 3 << 4);
const __m128i q4h_5 = __lsx_vandi_b(q4bitsH_1, 3 << 4);
const __m128i q4h_6 = __lsx_vsrli_b(__lsx_vandi_b(q4bitsH_0, 3 << 6), 2);
const __m128i q4h_7 = __lsx_vsrli_b(__lsx_vandi_b(q4bitsH_1, 3 << 6), 2);
const __m128i q4bits1_0 = __lsx_vld((const __m128i*)q4, 0); q4 += 16;
const __m128i q4bits1_1 = __lsx_vld((const __m128i*)q4, 0); q4 += 16;
const __m128i q4bits2_0 = __lsx_vld((const __m128i*)q4, 0); q4 += 16;
const __m128i q4bits2_1 = __lsx_vld((const __m128i*)q4, 0); q4 += 16;
const __m128i q4_0 = __lsx_vor_v(__lsx_vandi_b(q4bits1_0, 0xf), q4h_0);
const __m128i q4_1 = __lsx_vor_v(__lsx_vandi_b(q4bits1_1, 0xf), q4h_1);
const __m128i q4_2 = __lsx_vor_v(__lsx_vandi_b(q4bits2_0, 0xf), q4h_2);
const __m128i q4_3 = __lsx_vor_v(__lsx_vandi_b(q4bits2_1, 0xf), q4h_3);
const __m128i q4_4 = __lsx_vor_v(__lsx_vsrli_b(q4bits1_0, 4), q4h_4);
const __m128i q4_5 = __lsx_vor_v(__lsx_vsrli_b(q4bits1_1, 4), q4h_5);
const __m128i q4_6 = __lsx_vor_v(__lsx_vsrli_b(q4bits2_0, 4), q4h_6);
const __m128i q4_7 = __lsx_vor_v(__lsx_vsrli_b(q4bits2_1, 4), q4h_7);
const __m128i q8_0 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
const __m128i q8_1 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
const __m128i q8_2 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
const __m128i q8_3 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
const __m128i q8_4 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
const __m128i q8_5 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
const __m128i q8_6 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
const __m128i q8_7 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
__m128i p16_0 = lsx_maddubs_h(__lsx_vsub_b(q4_0, m32s), q8_0);
__m128i p16_1 = lsx_maddubs_h(__lsx_vsub_b(q4_1, m32s), q8_1);
__m128i p16_2 = lsx_maddubs_h(__lsx_vsub_b(q4_2, m32s), q8_2);
__m128i p16_3 = lsx_maddubs_h(__lsx_vsub_b(q4_3, m32s), q8_3);
__m128i p16_4 = lsx_maddubs_h(__lsx_vsub_b(q4_4, m32s), q8_4);
__m128i p16_5 = lsx_maddubs_h(__lsx_vsub_b(q4_5, m32s), q8_5);
__m128i p16_6 = lsx_maddubs_h(__lsx_vsub_b(q4_6, m32s), q8_6);
__m128i p16_7 = lsx_maddubs_h(__lsx_vsub_b(q4_7, m32s), q8_7);
const __m128i sc_vec = j == 0 ? scales_lo : scales_hi;
p16_0 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 0), p16_0);
p16_1 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 1), p16_1);
p16_2 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 2), p16_2);
p16_3 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 3), p16_3);
p16_4 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 4), p16_4);
p16_5 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 5), p16_5);
p16_6 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 6), p16_6);
p16_7 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 7), p16_7);
sumi_0 = __lsx_vadd_w(sumi_0, __lsx_vadd_w(p16_0, p16_2));
sumi_1 = __lsx_vadd_w(sumi_1, __lsx_vadd_w(p16_1, p16_3));
sumi_0 = __lsx_vadd_w(sumi_0, __lsx_vadd_w(p16_4, p16_6));
sumi_1 = __lsx_vadd_w(sumi_1, __lsx_vadd_w(p16_5, p16_7));
}
__m128 p_0 = __lsx_vfmul_s(__lsx_vreplfr2vr_s(d), __lsx_vffint_s_w(sumi_0));
__m128 p_1 = __lsx_vfmul_s(__lsx_vreplfr2vr_s(d), __lsx_vffint_s_w(sumi_1));
acc_0 = __lsx_vfadd_s(p_0, acc_0);
acc_1 = __lsx_vfadd_s(p_1, acc_1);
}
*s = hsum_float_4x4(acc_0, acc_1, (__m128)__lsx_vldi(0), (__m128)__lsx_vldi(0));
#else
UNUSED(x);
UNUSED(y);
@@ -2149,6 +2271,35 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
*s = hsum_float_8(accum);
#elif defined(__loongarch_sx)
const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
__m128 accum = (__m128)__lsx_vldi(0);
for (int ibl = 0; ibl < nb; ++ibl) {
const uint8_t * qs = x[ibl].qs;
const int8_t * q8 = y[ibl].qs;
uint16_t sh = x[ibl].scales_h;
__m128i sumi = __lsx_vldi(0);
for (int ib = 0; ib < QK_K/32; ++ib) {
const __m128i q4bits = __lsx_vld((const __m128i*)qs, 0); qs += 16;
const __m128i q8b_0 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
const __m128i q8b_1 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
const __m128i q4b_0 = __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits, 0xf));
const __m128i q4b_1 = __lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits, 4));
const __m128i p16_0 = lsx_maddubs_h(q4b_0, q8b_0);
const __m128i p16_1 = lsx_maddubs_h(q4b_1, q8b_1);
const int16_t ls = (((x[ibl].scales_l[ib/2] >> ((ib & 1) * 4)) & 0xf) | ((sh & 0x3) << 4)) - 32;
sh >>= 2;
sumi = __lsx_vadd_w(lsx_madd_h(p16_0, __lsx_vreplgr2vr_h(ls)), sumi);
sumi = __lsx_vadd_w(lsx_madd_h(p16_1, __lsx_vreplgr2vr_h(ls)), sumi);
}
const float ds = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
accum = __lsx_vfadd_s(__lsx_vfmul_s(__lsx_vreplfr2vr_s(ds), __lsx_vffint_s_w(sumi)), accum);
}
*s = ((v4f32)lsx_hadd_s(lsx_hadd_s(accum, accum), lsx_hadd_s(accum, accum)))[0];
#else
UNUSED(x);
UNUSED(y);
+3 -16
View File
@@ -1125,25 +1125,12 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
#define GGML_F16_EPR 4
static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
float tmp[4];
tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
return (__m128)__lsx_vld(tmp, 0);
return __lsx_vfcvtl_s_h(__lsx_vld((const void *)x, 0));
}
static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
float arr[4];
__lsx_vst(y, arr, 0);
x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
__m128i a = __lsx_vfcvt_h_s(y, y);
memcpy(x, &a, sizeof(ggml_fp16_t) * 4);
}
#define GGML_F32Cx4 __m128
+7 -1
View File
@@ -1732,6 +1732,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col(ggml_metal_library_t lib, const ggml_tensor * op) {
assert(op->op == GGML_OP_IM2COL);
GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne);
GGML_ASSERT(ggml_is_contiguous(op->src[1]));
GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
GGML_ASSERT(op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
@@ -1739,7 +1741,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col(ggml_meta
char base[256];
char name[256];
snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type));
if (ne00*ne01 <= 1024) {
snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type));
} else {
snprintf(base, 256, "kernel_im2col_ext_%s", ggml_type_name(op->type));
}
snprintf(name, 256, "%s", base);
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+17 -7
View File
@@ -3635,16 +3635,26 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
auto pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);
GGML_ASSERT(KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
if (KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N);
const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW);
} else {
const uint64_t n_threads = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), N);
const int64_t quotient = N / n_threads + (N % n_threads > 0 ? 1 : 0);
ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
ggml_metal_encoder_dispatch_threadgroups(enc, quotient * CHW, OH, OW, n_threads, 1, 1);
}
return 1;
}
+53 -53
View File
@@ -4696,59 +4696,59 @@ kernel void kernel_im2col(
template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
// TODO: obsolete -- remove
//typedef void (im2col_ext_t)(
// constant ggml_metal_kargs_im2col & args,
// device const float * x,
// device char * dst,
// uint3 tgpig[[threadgroup_position_in_grid]],
// uint3 tgpg[[threadgroups_per_grid]],
// uint3 tpitg[[thread_position_in_threadgroup]],
// uint3 ntg[[threads_per_threadgroup]]);
//
//template <typename T>
//kernel void kernel_im2col_ext(
// constant ggml_metal_kargs_im2col & args,
// device const float * x,
// device char * dst,
// uint3 tgpig[[threadgroup_position_in_grid]],
// uint3 tgpg[[threadgroups_per_grid]], // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
// uint3 tpitg[[thread_position_in_threadgroup]],
// uint3 ntg[[threads_per_threadgroup]]) { // [M, 1, 1]
// const int64_t KHW = (int64_t)args.KHW;
//
// const int64_t d = tgpig[0] / args.CHW;
// const int64_t chw = tgpig[0] % args.CHW;
// const int64_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1)
// const int64_t HW = tgpig[0] % KHW;
//
// const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
// if (tpitg_0 >= args.N) {
// return;
// }
//
// const int64_t tpitg_1 = HW / args.KW;
// const int64_t tpitg_2 = HW % args.KW;
//
// const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0;
// const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1;
//
// const int64_t offset_dst =
// (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
// (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2);
//
// device T * pdst = (device T *) (dst);
//
// if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
// pdst[offset_dst] = 0.0f;
// } else {
// const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;
// pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
// }
//}
//
//template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
//template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
// TODO: optimize
typedef void (im2col_ext_t)(
constant ggml_metal_kargs_im2col & args,
device const float * x,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tgpg[[threadgroups_per_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]);
template <typename T>
kernel void kernel_im2col_ext(
constant ggml_metal_kargs_im2col & args,
device const float * x,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tgpg[[threadgroups_per_grid]], // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) { // [M, 1, 1]
const int64_t KHW = (int64_t)args.KHW;
const int64_t d = tgpig[0] / args.CHW;
const int64_t chw = tgpig[0] % args.CHW;
const int64_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1)
const int64_t HW = tgpig[0] % KHW;
const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
if (tpitg_0 >= args.N) {
return;
}
const int64_t tpitg_1 = HW / args.KW;
const int64_t tpitg_2 = HW % args.KW;
const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0;
const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1;
const int64_t offset_dst =
(tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
(tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2);
device T * pdst = (device T *) (dst);
if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
pdst[offset_dst] = 0.0f;
} else {
const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;
pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
}
}
template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
template <typename TK>
kernel void kernel_conv_2d(
+79 -2
View File
@@ -585,6 +585,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_convert_block_mxfp4_trans4_ns, kernel_restore_block_mxfp4_trans4_ns;
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans;
cl_kernel kernel_convert_block_q6_K_noshuffle, kernel_restore_block_q6_K_noshuffle;
cl_kernel kernel_convert_bf16_to_f16, kernel_convert_f16_to_bf16;
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
cl_kernel kernel_convert_block_q4_0_noshuffle;
cl_kernel kernel_restore_block_q4_0_noshuffle;
@@ -1175,6 +1176,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
CL_CHECK((backend_ctx->kernel_restore_block_iq4_nl = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_iq4_nl", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_iq4_nl_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_iq4_nl_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_iq4_nl_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_iq4_nl_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_convert_bf16_to_f16 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_bf16_to_f16", &err), err));
CL_CHECK((backend_ctx->kernel_convert_f16_to_bf16 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_f16_to_bf16", &err), err));
GGML_LOG_CONT(".");
}
@@ -5019,6 +5022,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_OP_MUL_MAT:
if (op->src[0]->type == GGML_TYPE_F16) {
return true;
} else if (op->src[0]->type == GGML_TYPE_BF16) {
return true;
} else if (op->src[0]->type == GGML_TYPE_F32) {
return op->src[1]->type == GGML_TYPE_F32;
} else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q4_1 ||
@@ -6828,6 +6833,40 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
}
#endif // GGML_OPENCL_SOA_Q
// convert bf16 to f16 and store as f16 in device buffer
if (tensor->type == GGML_TYPE_BF16) {
GGML_ASSERT(offset % sizeof(ggml_fp16_t) == 0 && size % sizeof(ggml_fp16_t) == 0
&& "Offset and size must be multiples of 2 for bf16 tensors");
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
GGML_ASSERT(extra);
cl_ulong n_elements = size / sizeof(ggml_fp16_t);
cl_ulong off_dst = (extra->offset + offset) / sizeof(ggml_fp16_t);
cl_int err;
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
size, (void *) data, &err);
CL_CHECK(err);
cl_kernel kernel = backend_ctx->kernel_convert_bf16_to_f16;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->data_device));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_dst));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &n_elements));
size_t global_work_size[] = { (size_t)CEIL_DIV(n_elements, 64)*64, 1, 1 };
size_t local_work_size[] = { 64, 1, 1 };
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
CL_CHECK(clReleaseMemObject(data_device));
CL_CHECK(clReleaseEvent(evt));
return;
}
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
GGML_ASSERT(extra);
@@ -7676,6 +7715,41 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
}
#endif // GGML_OPENCL_SOA_Q
if (tensor->type == GGML_TYPE_BF16) {
GGML_ASSERT(offset % sizeof(ggml_fp16_t) == 0 && size % sizeof(ggml_fp16_t) == 0
&& "Offset and size must be multiples of 2 for bf16 tensors");
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
GGML_ASSERT(extra);
cl_ulong n_elements = size / sizeof(ggml_fp16_t);
cl_ulong off_src = (extra->offset + tensor->view_offs + offset) / sizeof(ggml_fp16_t);
cl_int err;
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
CL_CHECK(err);
cl_kernel kernel = backend_ctx->kernel_convert_f16_to_bf16;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &n_elements));
size_t global_work_size[] = { (size_t)CEIL_DIV(n_elements, 64)*64, 1, 1 };
size_t local_work_size[] = { 64, 1, 1 };
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
CL_CHECK(clReleaseEvent(evt));
CL_CHECK(clEnqueueReadBuffer(
queue, data_device, CL_TRUE, 0, size, data, 0, NULL, NULL));
CL_CHECK(clReleaseMemObject(data_device));
return;
}
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
CL_CHECK(clEnqueueReadBuffer(
@@ -8165,6 +8239,7 @@ static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor
kernel = backend_ctx->kernel_cpy_f32_f32;
break;
case GGML_TYPE_F16:
case GGML_TYPE_BF16: // stored as f16 on device
kernel = backend_ctx->kernel_cpy_f16_f16;
break;
default:
@@ -11125,7 +11200,8 @@ static bool ggml_cl_can_use_adreno_xmem_gemm_f16_f32(
if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
return false;
}
if (src0->type != GGML_TYPE_F16 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
if ((src0->type != GGML_TYPE_F16 && src0->type != GGML_TYPE_BF16) ||
src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
return false;
}
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
@@ -12843,7 +12919,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
const enum ggml_type src0t = src0->type;
// bf16 is stored as f16 on device
const enum ggml_type src0t = (src0->type == GGML_TYPE_BF16) ? GGML_TYPE_F16 : src0->type;
const enum ggml_type src1t = src1->type;
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+42
View File
@@ -117,6 +117,48 @@ struct block_iq4_nl
uint8_t qs[QK4_NL / 2];
};
//------------------------------------------------------------------------------
// bf16 to f16
//------------------------------------------------------------------------------
kernel void kernel_convert_bf16_to_f16(
global const ushort * src,
global half * dst,
ulong off_dst,
ulong n
) {
uint i = get_global_id(0);
if (i >= n) {
return;
}
dst[i + off_dst] = (half) as_float((uint) src[i] << 16);
}
//------------------------------------------------------------------------------
// f16 to bf16
//------------------------------------------------------------------------------
kernel void kernel_convert_f16_to_bf16(
global const half * src,
ulong off_src,
global ushort * dst,
ulong n
) {
uint i = get_global_id(0);
if (i >= n) {
return;
}
float f = (float) src[i + off_src];
uint bits = as_uint(f);
if ((bits & 0x7fffffffu) > 0x7f800000u) {
// nan to quiet nan
dst[i] = (ushort)((bits >> 16) | 0x40u);
} else {
uint rounded = bits + 0x7fffu + ((bits >> 16) & 1u);
dst[i] = (ushort)(rounded >> 16);
}
}
//------------------------------------------------------------------------------
// kernel_convert_block_q4_0
// Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
+109 -37
View File
@@ -691,6 +691,7 @@ struct vk_device_struct {
uint32_t coopmat_int_k;
bool coopmat2;
bool coopmat2_bf16_support {};
bool coopmat2_decode_vector;
bool pipeline_executable_properties_support {};
@@ -3139,7 +3140,7 @@ struct vk_fa_tuning_params {
};
static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type, ggml_type v_type);
static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc);
static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type = GGML_TYPE_F16);
static vk_fa_tuning_params get_fa_tuning_params_scalar(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) {
@@ -3279,6 +3280,13 @@ static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_
FaCodePath path = device->coopmat2 ? FA_COOPMAT2 :
device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR;
if (path == FA_COOPMAT2 && k_type == GGML_TYPE_BF16 && !device->coopmat2_bf16_support) {
path = FA_COOPMAT1;
}
if (path == FA_COOPMAT1 && k_type == GGML_TYPE_BF16 && !device->coopmat_bf16_support) {
path = FA_SCALAR;
}
if (path == FA_COOPMAT1 && device->architecture == vk_device_architecture::NVIDIA_TURING) {
// Nvidia compiler bug, see https://github.com/ggml-org/llama.cpp/pull/19075#issuecomment-3820716090
path = FA_SCALAR;
@@ -3288,7 +3296,7 @@ static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_
bool shape_ok = (f32acc && device->coopmat_support_16x16x16_f32acc) ||
(!f32acc && device->coopmat_support_16x16x16_f16acc);
const vk_fa_tuning_params params = get_fa_tuning_params_coopmat1(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc);
bool shmem_ok = ggml_vk_flash_attn_coopmat_shmem_support(device, params, hsk, hsv, f32acc);
bool shmem_ok = ggml_vk_flash_attn_coopmat_shmem_support(device, params, hsk, hsv, f32acc, k_type);
if (!shape_ok || !shmem_ok) {
path = FA_SCALAR;
@@ -3334,8 +3342,8 @@ static vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const
static std::vector<uint32_t> get_fa_spec_constants(const vk_fa_pipeline_state& state) {
const auto fa_block_bytes = [](ggml_type t) -> uint32_t {
// decodeBufF32 uses a block of vec4s for a better memory access pattern.
return t == GGML_TYPE_F32 ? 16u : (uint32_t) ggml_type_size(t);
if (t == GGML_TYPE_F32) return 16u;
return (uint32_t) ggml_type_size(t);
};
return {
/* 0 WorkGroupSize */ state.workgroup_size,
@@ -3849,10 +3857,16 @@ static void ggml_vk_load_shaders(vk_device& device) {
const uint32_t fa_sgs = fa.first.subgroup_size;
const bool fa_ds = fa.first.subgroup_size == 0;
const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16;
const bool use_mmq = ggml_vk_fa_scalar_uses_mmq(device, fa.first.k_type);
const void * spv_data = nullptr;
size_t spv_size = 0;
if (use_mmq) {
const char *name = nullptr;
if (bf16_kv) {
spv_data = flash_attn_f32_f16_fp32_data;
spv_size = flash_attn_f32_f16_fp32_len;
name = aligned ? "flash_attn_f32_bf16_aligned" : "flash_attn_f32_bf16";
} else if (use_mmq) {
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
if (device->fp16) {
if (f32acc) { spv_data = flash_attn_f32_f16_int8_data; spv_size = flash_attn_f32_f16_int8_len; }
@@ -3862,6 +3876,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
spv_size = flash_attn_f32_f16_fp32_int8_len;
}
#endif
name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16";
} else {
if (device->fp16) {
if (f32acc) { spv_data = flash_attn_f32_f16_data; spv_size = flash_attn_f32_f16_len; }
@@ -3870,8 +3885,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
spv_data = flash_attn_f32_f16_fp32_data;
spv_size = flash_attn_f32_f16_fp32_len;
}
name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16";
}
const char *name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16";
ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7,
sizeof(vk_flash_attn_push_constants), {Br, 1, 1},
get_fa_spec_constants(fa.first), aligned ? Bc : 1, true,
@@ -3889,11 +3904,25 @@ static void ggml_vk_load_shaders(vk_device& device) {
const uint32_t fa_sgs = fa.first.subgroup_size;
const bool fa_ds = fa.first.subgroup_size == 0;
const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16;
const void * spv_data;
size_t spv_size;
if (f32acc) { spv_data = flash_attn_f32_f16_cm1_data; spv_size = flash_attn_f32_f16_cm1_len; }
else { spv_data = flash_attn_f32_f16_f16acc_cm1_data; spv_size = flash_attn_f32_f16_f16acc_cm1_len; }
const char *name = aligned ? "flash_attn_f32_f16_aligned_cm1" : "flash_attn_f32_f16_cm1";
const char *name;
if (bf16_kv) {
#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
if (!device->coopmat_bf16_support) continue;
spv_data = flash_attn_f32_f16_bf16_cm1_data;
spv_size = flash_attn_f32_f16_bf16_cm1_len;
name = aligned ? "flash_attn_f32_bf16_aligned_cm1" : "flash_attn_f32_bf16_cm1";
#else
continue;
#endif
} else {
if (f32acc) { spv_data = flash_attn_f32_f16_cm1_data; spv_size = flash_attn_f32_f16_cm1_len; }
else { spv_data = flash_attn_f32_f16_f16acc_cm1_data; spv_size = flash_attn_f32_f16_f16acc_cm1_len; }
name = aligned ? "flash_attn_f32_f16_aligned_cm1" : "flash_attn_f32_f16_cm1";
}
ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7,
sizeof(vk_flash_attn_push_constants), {Br, 1, 1},
get_fa_spec_constants(fa.first), aligned ? Bc : 1, true,
@@ -3911,10 +3940,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
const bool aligned = fa.first.aligned;
const bool f32acc = fa.first.f32acc;
const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16;
const void * spv_data;
size_t spv_size;
const char * name;
if (aligned) {
if (bf16_kv) {
#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
if (!device->coopmat2_bf16_support) continue;
spv_data = flash_attn_f32_f16_bf16_cm2_data;
spv_size = flash_attn_f32_f16_bf16_cm2_len;
name = aligned ? "flash_attn_f32_bf16_aligned_cm2" : "flash_attn_f32_bf16_cm2";
#else
continue;
#endif
} else if (aligned) {
if (f32acc) { spv_data = flash_attn_f32_f16_cm2_data; spv_size = flash_attn_f32_f16_cm2_len; name = "flash_attn_f32_f16_aligned_f32acc_cm2"; }
else { spv_data = flash_attn_f32_f16_f16acc_cm2_data; spv_size = flash_attn_f32_f16_f16acc_cm2_len; name = "flash_attn_f32_f16_aligned_f16acc_cm2"; }
} else {
@@ -5784,46 +5823,72 @@ static vk_device ggml_vk_get_device(size_t idx) {
found_fp16_256 = false,
found_fp32_128 = false,
found_fp32_256 = false;
bool found_bf16_128 = false,
found_bf16_256 = false;
// need to support fp16*fp16 with fp16/fp32 accumulator, for workgroupsize 128
// with 32x16x16 and 256 with 32x32x16.
for (auto &prop : flexible_dimensions) {
if (prop.saturatingAccumulation == VK_FALSE &&
prop.scope == VK_SCOPE_WORKGROUP_KHR &&
prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
prop.scope == VK_SCOPE_WORKGROUP_KHR) {
if (prop.workgroupInvocations == 128 &&
prop.MGranularity <= 32 &&
prop.NGranularity <= 16 &&
prop.KGranularity <= 16) {
if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
found_fp16_128 = true;
if (prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
if (prop.workgroupInvocations == 128 &&
prop.MGranularity <= 32 &&
prop.NGranularity <= 16 &&
prop.KGranularity <= 16) {
if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
found_fp16_128 = true;
}
if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
found_fp32_128 = true;
}
}
if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
found_fp32_128 = true;
if (prop.workgroupInvocations == 256 &&
prop.MGranularity <= 32 &&
prop.NGranularity <= 32 &&
prop.KGranularity <= 16) {
if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
found_fp16_256 = true;
}
if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
found_fp32_256 = true;
}
}
}
if (prop.workgroupInvocations == 256 &&
prop.MGranularity <= 32 &&
prop.NGranularity <= 32 &&
prop.KGranularity <= 16) {
if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
found_fp16_256 = true;
#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
if (prop.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR &&
prop.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR &&
prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
if (prop.workgroupInvocations == 128 &&
prop.MGranularity <= 32 &&
prop.NGranularity <= 16 &&
prop.KGranularity <= 16) {
found_bf16_128 = true;
}
if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
found_fp32_256 = true;
if (prop.workgroupInvocations == 256 &&
prop.MGranularity <= 32 &&
prop.NGranularity <= 32 &&
prop.KGranularity <= 16) {
found_bf16_256 = true;
}
}
#endif
}
}
if (found_fp16_128 && found_fp16_256 &&
found_fp32_128 && found_fp32_256 &&
coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) {
device->coopmat2 = true;
device->coopmat2_bf16_support = found_bf16_128 && found_bf16_256;
device->coopmat2_decode_vector = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector;
}
}
@@ -9448,7 +9513,8 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
const uint32_t Br = params.block_rows;
const uint32_t Bc = params.block_cols;
const uint32_t float_type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
// BF16 uses the fp32 shader (FLOAT_TYPE=float)
const uint32_t float_type_size = (device->fp16 && k_type != GGML_TYPE_BF16) ? sizeof(ggml_fp16_t) : sizeof(float);
const bool mmq = ggml_vk_fa_scalar_uses_mmq(device, k_type);
@@ -9489,7 +9555,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
return supported;
}
static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc) {
static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type) {
// Needs to be kept up to date on shader changes
const uint32_t Br = params.block_rows;
const uint32_t Bc = params.block_cols;
@@ -9519,8 +9585,10 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co
const uint32_t vsh_stride = MatBc / 4 * row_split;
const uint32_t ksh = ((kvshstride >= vsh_stride) ? (Bc * kvshstride) : (Bc * vsh_stride)) * f16vec4;
// BF16 PVMat accumulator is f32 (no bf16 accumulator support), so pvsh is vec4 (16 bytes)
const uint32_t pvsh_elem_size = (k_type == GGML_TYPE_BF16) ? 16u : f16vec4;
const uint32_t osh_stride = params.row_split * MatBr / 4;
const uint32_t pvsh = MatBc * osh_stride * f16vec4;
const uint32_t pvsh = MatBc * osh_stride * pvsh_elem_size;
const uint32_t slope = Br * acctype;
@@ -9589,7 +9657,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
uint32_t workgroups_y = (uint32_t)neq2;
uint32_t workgroups_z = (uint32_t)neq3;
const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32;
const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32 || k->type == GGML_TYPE_BF16;
// For scalar/coopmat1 FA, we can use the "large" size to accommodate qga.
// For coopmat2 FA, we always use the small size (which is still pretty large for gqa).
@@ -16400,6 +16468,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
switch (t) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q5_0:
@@ -16415,6 +16484,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
if (!fa_kv_ok(op->src[1]->type) || !fa_kv_ok(op->src[2]->type)) {
return false;
}
if ((op->src[1]->type == GGML_TYPE_BF16) != (op->src[2]->type == GGML_TYPE_BF16)) {
return false;
}
if (!coopmat2 && !(device->subgroup_shuffle && device->subgroup_vote)) {
// scalar/coopmat1 FA uses subgroupShuffle/subgroupAll
return false;
@@ -97,8 +97,17 @@ layout (binding = 6) readonly buffer MO {uint32_t data_mask_opt[];};
#define FA_TYPE_Q5_0 6u
#define FA_TYPE_Q5_1 7u
#define FA_TYPE_Q8_0 8u
#define FA_TYPE_BF16 30u
#define FA_TYPE_Q1_0 41u
#if defined(BFLOAT16)
#define O_TYPE float
#define O_TYPEV4 vec4
#else
#define O_TYPE FLOAT_TYPE
#define O_TYPEV4 FLOAT_TYPEV4
#endif
// Number of matrix elements per buffer block, derived from the K/V type spec
// constant. F32 is treated as a vec4 "block" of 4 floats. F16 uses block size 1
// and bypasses the dequant path entirely. Quants follow their ggml block sizes.
@@ -111,6 +120,7 @@ uint fa_block_elems(uint ty) {
case FA_TYPE_Q5_0: return uint(QUANT_K_Q5_0);
case FA_TYPE_Q5_1: return uint(QUANT_K_Q5_1);
case FA_TYPE_Q8_0: return uint(QUANT_K_Q8_0);
case FA_TYPE_BF16: return 1u;
case FA_TYPE_Q1_0: return uint(QUANT_K_Q1_0); // cm2-only, harmless elsewhere
default: return 1u;
}
@@ -248,7 +258,7 @@ const float FATTN_KQ_MAX_OFFSET = 3.0f*0.6931f;
// Store the output when doing grouped query attention.
// Rows index by Q's dimension 2, and the first N rows are valid.
void gqaStore(const in uint32_t r, const in uint32_t c, const in FLOAT_TYPEV4 elems, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
void gqaStore(const in uint32_t r, const in uint32_t c, const in O_TYPEV4 elems, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
{
uint32_t offset = (iq2 + r) * HSV / 4 + c;
data_ov4[o_offset + offset] = D_TYPEV4(elems);
@@ -6,6 +6,10 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#if defined(BFLOAT16)
#extension GL_EXT_bfloat16 : enable
#endif
#extension GL_KHR_shader_subgroup_basic : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_vote : enable
@@ -14,7 +18,9 @@
#include "types.glsl"
#include "flash_attn_base.glsl"
#if !defined(BFLOAT16)
#include "flash_attn_dequant.glsl"
#endif
// These need to be supported N,M values for a MatBc x MatBr x 16 coopmatmuladd
const uint32_t MatBr = 16;
@@ -27,32 +33,32 @@ const uint32_t cols_per_thread = Bc / cols_per_iter;
layout (binding = 0) readonly buffer Q {float data_q[];};
layout (binding = 0) readonly buffer QV4 {vec4 data_qv4[];};
layout (binding = 1) readonly buffer K {float16_t data_k[];};
layout (binding = 1) readonly buffer KV4 {f16vec4 data_kv4[];};
layout (binding = 2) readonly buffer V {float16_t data_v[];};
layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];};
layout (binding = 1) readonly buffer K {FLOAT_TYPE data_k[];};
layout (binding = 1) readonly buffer KV4 {FLOAT_TYPEV4 data_kv4[];};
layout (binding = 2) readonly buffer V {FLOAT_TYPE data_v[];};
layout (binding = 2) readonly buffer VV4 {FLOAT_TYPEV4 data_vv4[];};
layout (binding = 3) readonly buffer M {float16_t data_m[];};
shared float tmpsh[row_split];
const uint32_t qstride = HSK_pad / 4 + 2; // in units of f16vec4
shared f16vec4 Qf[Br * qstride];
const uint32_t qstride = HSK_pad / 4 + 2;
shared FLOAT_TYPEV4 Qf[Br * qstride];
const uint psh_stride = Br / 4 + 2;
shared f16vec4 Psh[Bc * psh_stride];
shared FLOAT_TYPEV4 Psh[Bc * psh_stride];
// Avoid padding for hsk==256 to make it fit in 48KB shmem.
const uint32_t sfshstride = (HSK <= 128) ? (Br / 4 + 2) : Br / 4;
shared ACC_TYPEV4 sfsh[Bc * sfshstride];
const uint32_t D_pad = HSK_pad > HSV_pad ? HSK_pad : HSV_pad;
const uint32_t kvsh_stride = (SHMEM_STAGING != 0 ? D_pad : MatBr) / 4 + 2; // in units of f16vec4
const uint32_t kvsh_stride = (SHMEM_STAGING != 0 ? D_pad : MatBr) / 4 + 2;
const uint v_cols = MatBc / 4 * row_split; // total cols, 4 vec4s per MatBc * number of subgroups
const uint vsh_stride = v_cols;
shared f16vec4 kvsh[(kvsh_stride >= vsh_stride) ? (Bc * kvsh_stride) : (Bc * vsh_stride)];
shared FLOAT_TYPEV4 kvsh[(kvsh_stride >= vsh_stride) ? (Bc * kvsh_stride) : (Bc * vsh_stride)];
const uint32_t osh_stride = row_split * MatBr / 4;
shared f16vec4 pvsh[MatBc * osh_stride];
shared O_TYPEV4 pvsh[MatBc * osh_stride];
shared ACC_TYPE slope[Br];
@@ -76,7 +82,7 @@ void main() {
if ((HSK % 16) != 0) {
[[unroll]] for (uint i = 0; i < Br * qstride; i += gl_WorkGroupSize.x) {
if (i + tid < Br * qstride) {
Qf[i + tid] = f16vec4(0);
Qf[i + tid] = FLOAT_TYPEV4(0);
}
}
barrier();
@@ -89,15 +95,15 @@ void main() {
uint32_t r = (idx + tid) / (HSK / 4);
if (r < Br && d < HSK / 4 &&
i * Br + r < N) {
Qf[r * qstride + d] = f16vec4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d] * p.scale);
Qf[r * qstride + d] = FLOAT_TYPEV4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d] * p.scale);
}
}
barrier();
f16vec4 Of[rows_per_thread][d_per_thread];
O_TYPEV4 Of[rows_per_thread][d_per_thread];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
[[unroll]] for (uint32_t d = 0; d < d_per_thread; ++d) {
Of[r][d] = f16vec4(0.0);
Of[r][d] = O_TYPEV4(0.0);
}
}
@@ -222,15 +228,18 @@ void main() {
uint32_t d = (idx + tid) % (HSK_pad / 4);
uint32_t c = (idx + tid) / (HSK_pad / 4);
if (idx + gl_WorkGroupSize.x <= Bc * HSK_pad / 4 || c < Bc) {
f16vec4 K_Tf = f16vec4(0);
FLOAT_TYPEV4 K_Tf = FLOAT_TYPEV4(0);
if ((!KV_bounds_check || j * Bc + c < KV) && (HSK == HSK_pad || d < HSK / 4)) {
#if !defined(BFLOAT16)
if (USE_DECODE_K) {
uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE_K + 4 * d;
uint ib = coord / BLOCK_SIZE_K;
uint iqs = (coord % BLOCK_SIZE_K);
K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
} else {
K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
} else
#endif
{
K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
}
}
@@ -244,16 +253,16 @@ void main() {
// Bc split across workgroup (four subgroups), loop over HSK in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16
// This is written transposed in order to allow for N being 8 if implementations need it
coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> SfMat = coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
coopmat<float16_t, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
coopmat<float16_t, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
coopmat<FLOAT_TYPE, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
coopmat<FLOAT_TYPE, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
[[unroll]] for (uint32_t d = 0; d < HSK_pad / 16; ++d) {
// If SHMEM_STAGING is set, a Bc * HSK_pad size tile of K is loaded to shmem
// If not, f16 K is loaded directly from global memory if aligned, otherwise
// If not, K is loaded directly from global memory if aligned, otherwise
// staged through a Bc * MatBr size staging buffer.
// If K is not type f16, then it is always staged for dequantization.
// If K is a quant type, then it is always staged for dequantization.
if (SHMEM_STAGING == 0) {
// For quants we always need to dequant into kvsh; for f16 we can load
// For quants we always need to dequant into kvsh; for f16/bf16 we can load
// directly from global memory when alignment / bounds allow it.
const bool stage_k = USE_DECODE_K || KV_bounds_check || d * 16 + 16 > HSK;
if (stage_k) {
@@ -262,15 +271,18 @@ void main() {
uint32_t col_vec = (idx + tid) % (MatBr / 4);
uint32_t row = (idx + tid) / (MatBr / 4);
if (idx + tid < Bc * MatBr / 4) {
f16vec4 K_Tf = f16vec4(0);
FLOAT_TYPEV4 K_Tf = FLOAT_TYPEV4(0);
if ((!KV_bounds_check || j * Bc + row < KV) && (HSK == HSK_pad || d * 16 + col_vec * 4 < HSK)) {
#if !defined(BFLOAT16)
if (USE_DECODE_K) {
uint coord = (j * Bc + row) * k_stride * BLOCK_SIZE_K + d * 16 + col_vec * 4;
uint ib = coord / BLOCK_SIZE_K;
uint iqs = (coord % BLOCK_SIZE_K);
K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
} else {
K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + row) * k_stride / 4 + d * 16 / 4 + col_vec]);
} else
#endif
{
K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + row) * k_stride / 4 + d * 16 / 4 + col_vec]);
}
}
@@ -357,7 +369,7 @@ void main() {
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d_local = d0 / threads_per_rowgroup;
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Of[r][d_local] = float16_t(eMf[r]) * Of[r][d_local];
Of[r][d_local] = O_TYPE(eMf[r]) * Of[r][d_local];
}
}
@@ -368,10 +380,10 @@ void main() {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; r += 4) {
const uint row = tile_row(r);
if (KV_bounds_check && j * Bc + col >= KV) {
Psh[col * psh_stride + row / 4] = f16vec4(0.0f);
Psh[col * psh_stride + row / 4] = FLOAT_TYPEV4(0.0f);
} else {
const vec4 mfvec = vec4(Mf[r], Mf[r + 1], Mf[r + 2], Mf[r + 3]);
const f16vec4 Pf = f16vec4(exp(vec4(sfsh[row / 4 + col * sfshstride]) - mfvec));
const FLOAT_TYPEV4 Pf = FLOAT_TYPEV4(exp(vec4(sfsh[row / 4 + col * sfshstride]) - mfvec));
[[unroll]] for (uint32_t vec_idx = 0; vec_idx < 4; ++vec_idx) {
Lf[r + vec_idx] += Pf[vec_idx];
}
@@ -385,15 +397,18 @@ void main() {
uint32_t d = (idx + tid) % (HSV_pad / 4);
uint32_t c = (idx + tid) / (HSV_pad / 4);
if (idx + gl_WorkGroupSize.x <= Bc * HSV_pad / 4 || c < Bc) {
f16vec4 V_Tf = f16vec4(0);
FLOAT_TYPEV4 V_Tf = FLOAT_TYPEV4(0);
if ((!KV_bounds_check || j * Bc + c < KV) && (HSV == HSV_pad || d < HSV / 4)) {
#if !defined(BFLOAT16)
if (USE_DECODE_V) {
uint coord = (j * Bc + c) * v_stride * BLOCK_SIZE_V + 4 * d;
uint ib = coord / BLOCK_SIZE_V;
uint iqs = (coord % BLOCK_SIZE_V);
V_Tf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
} else {
V_Tf = f16vec4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]);
} else
#endif
{
V_Tf = FLOAT_TYPEV4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]);
}
}
@@ -409,7 +424,7 @@ void main() {
[[unroll]] for (uint32_t hsv_tile = 0; hsv_tile < num_hsv_tiles; ++hsv_tile) {
const uint hsv_offset = (hsv_tile * row_split + gl_SubgroupID) * 16;
coopmat<float16_t, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> PVMat = coopmat<float16_t, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
coopmat<O_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> PVMat = coopmat<O_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
// Preload V tiles for [Bc, 16 * num subgroups]
const uint v_rows = Bc;
@@ -417,11 +432,11 @@ void main() {
const uint v_loads_per_thread = v_total / gl_WorkGroupSize.x;
// If SHMEM_STAGING is set, a Bc * HSV_pad size tile of V is loaded to shmem.
// If not, f16 V is loaded directly from global memory if aligned, otherwise
// If not, V is loaded directly from global memory if aligned, otherwise
// staged through a Bc * MatBr size staging buffer.
// If V is not type f16, then it is always staged for dequantization.
// If V is a quant type, then it is always staged for dequantization.
if (SHMEM_STAGING == 0) {
// For quants we always preload via kvsh. For f16 we only preload when
// For quants we always preload via kvsh. For f16/bf16 we only preload when
// alignment / bounds force it (otherwise we coopMatLoad direct from data_vv4).
const bool stage_v = USE_DECODE_V || KV_bounds_check;
if (stage_v) {
@@ -438,13 +453,16 @@ void main() {
const uint iqs = coord % BLOCK_SIZE_V;
if (!KV_bounds_check || (v_row < KV && v_col < HSV)) {
#if !defined(BFLOAT16)
if (USE_DECODE_V) {
kvsh[row * vsh_stride + col] = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
} else {
} else
#endif
{
kvsh[row * vsh_stride + col] = data_vv4[(v_offset + v_row * v_stride + v_col) / 4];
}
} else {
kvsh[row * vsh_stride + col] = f16vec4(0.0f);
kvsh[row * vsh_stride + col] = FLOAT_TYPEV4(0.0f);
}
}
}
@@ -459,7 +477,7 @@ void main() {
if (SHMEM_STAGING == 0) {
if (!USE_DECODE_V && !KV_bounds_check) {
// F16 values can be loaded directly from global memory
// F16/BF16 values can be loaded directly from global memory
const uint v_tile_row = j * Bc + bc_chunk * MatBc;
const uint v_tile_offset = v_offset / 4 + v_tile_row * v_stride / 4 + hsv_offset / 4;
coopMatLoad(QMat, data_vv4, v_tile_offset, v_stride / 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -573,7 +591,7 @@ void main() {
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d_local = d0 / threads_per_rowgroup;
Of[r][d_local] *= float16_t(ms);
Of[r][d_local] *= O_TYPE(ms);
}
} else {
vs = exp(sink - Mf[r]);
@@ -591,7 +609,7 @@ void main() {
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d_local = d0 / threads_per_rowgroup;
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Of[r][d_local] *= float16_t(Lfrcp[r]);
Of[r][d_local] *= O_TYPE(Lfrcp[r]);
#if defined(FLOAT_TYPE_MAX)
Of[r][d_local] = clamp(Of[r][d_local], -FLOAT_TYPE_MAX, FLOAT_TYPE_MAX);
#endif
@@ -8,6 +8,10 @@
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#if defined(BFLOAT16)
#extension GL_EXT_bfloat16 : enable
#endif
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_KHR_cooperative_matrix : enable
#extension GL_NV_cooperative_matrix2 : enable
@@ -21,7 +25,9 @@
#include "types.glsl"
#include "flash_attn_base.glsl"
#if !defined(BFLOAT16)
#include "dequant_funcs_cm2.glsl"
#endif
// buffer_reference stride = sizeof(struct) = FaBlockBytesK/V.
layout(buffer_reference, std430, buffer_reference_align = 1) buffer decodeBufFA_K {
@@ -31,6 +37,7 @@ layout(buffer_reference, std430, buffer_reference_align = 1) buffer decodeBufFA_
uint8_t raw[FaBlockBytesV];
};
#if !defined(BFLOAT16)
float16_t faDecodeK(const decodeBufFA_K bl_in, const uint blockCoords[2], const uint coordInBlock[2]) {
switch (FaTypeK) {
case FA_TYPE_F32: return dequantFuncF32 (decodeBufF32 (bl_in), blockCoords, coordInBlock);
@@ -91,6 +98,7 @@ f16vec4 faDecodeVVector(const decodeBufFA_V bl_in, const uint blockCoords[2], co
#define FADECODEK , faDecodeK
#define FADECODEV , faDecodeV
#endif
#endif
layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
layout (binding = 1) readonly buffer K {uint8_t data_k[];};
@@ -195,15 +203,15 @@ void main() {
tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1);
coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseAccumulator> Q;
coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA> Qf16;
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA> Qf16;
uint32_t q_offset = gqa_iq1*p.nb01*4/*sizeof(float)*/ + iq2*p.nb02+iq3*p.nb03;
coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK_pad));
Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA>(Q);
Qf16 *= float16_t(p.scale);
Q *= Q_TYPE(p.scale);
Qf16 = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA>(Q);
coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
coopmat<O_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O = coopmat<O_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
@@ -291,16 +299,20 @@ void main() {
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
coopmat<float16_t, gl_ScopeWorkgroup, HSK_pad, Bc, gl_MatrixUseB> K_T;
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, HSK_pad, Bc, gl_MatrixUseB> K_T;
uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13;
// F16: bs_k==1 (direct load). F32: bs_k==4 (vec4 / dequantFuncF32). Q4/Q8 family: bs_k==32. Q1_0: bs_k==128.
#if defined(BFLOAT16)
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose);
#else
const bool k_use_decode = (bs_k > 1u);
if (k_use_decode) {
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose FADECODEK);
} else {
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose);
}
#endif
S = coopMatMulAdd(Qf16, K_T, S);
if (LOGIT_SOFTCAP) {
@@ -351,22 +363,26 @@ void main() {
coopMatPerElementNV(P, P, replacePadding, ACC_TYPE(0.0), R, C);
}
coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA> P_A = coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA>(P);
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA> P_A = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA>(P);
// compute rowsum by multiplying by matrix of all ones.
coopmat<float16_t, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB> One = coopmat<float16_t, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB>(1.0);
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB> One = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB>(1.0);
rowsum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0.0);
rowsum = coopMatMulAdd(P_A, One, rowsum);
coopmat<float16_t, gl_ScopeWorkgroup, Bc, HSV_pad, gl_MatrixUseB> V;
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Bc, HSV_pad, gl_MatrixUseB> V;
uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
#if defined(BFLOAT16)
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad));
#else
const bool v_use_decode = (bs_v > 1u);
if (v_use_decode) {
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad) FADECODEV);
} else {
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad));
}
#endif
L = eM*L + rowsum;
@@ -378,7 +394,7 @@ void main() {
// resize eM by using smear/reduce
coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
O *= coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(eMdiag);
O *= coopmat<O_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(eMdiag);
O = coopMatMulAdd(P_A, V, O);
}
@@ -427,7 +443,7 @@ void main() {
if (sink > Mr[i]) {
ms = exp(Mr[i] - sink);
O[i] *= float16_t(ms);
O[i] *= O_TYPE(ms);
} else {
vs = exp(sink - Mr[i]);
}
@@ -28,6 +28,9 @@ layout (binding = 2) readonly buffer V_PACKED_Q5_1 { block_q5_1_packed16 data[];
layout (binding = 1) readonly buffer K_PACKED_Q8_0 { block_q8_0_packed16 data[]; } k_packed_q8_0;
layout (binding = 2) readonly buffer V_PACKED_Q8_0 { block_q8_0_packed16 data[]; } v_packed_q8_0;
layout (binding = 1) readonly buffer K_PACKED_BF16 { u16vec4 data[]; } k_packed_bf16;
layout (binding = 2) readonly buffer V_PACKED_BF16 { u16vec4 data[]; } v_packed_bf16;
// Q4_1 and Q5_1 packed32 views: aliased to the same memory as the packed16
// views, used by the MMQ K-side hot path for fast 4-uint loads.
layout (binding = 1) readonly buffer K_PACKED_Q4_1_P32 { block_q4_1_packed32 data[]; } k_packed_q4_1_p32;
@@ -99,6 +102,9 @@ layout (binding = 1) readonly buffer K_PACKED_Q5_1_P32 { block_q5_1_packed32 dat
return FLOAT_TYPE(BUF.data[a_offset + ib].d) * FLOAT_TYPEV4(v0.x, v0.y, v1.x, v1.y); \
}
#define FA_DEQUANT4_BF16(BUF) \
return FLOAT_TYPEV4(bf16_to_fp32(uvec4(BUF.data[(a_offset + ib) / 4])));
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
if (binding_idx == BINDING_IDX_K) {
switch (FaTypeK) {
@@ -108,6 +114,7 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
case FA_TYPE_Q5_0: FA_DEQUANT4_Q5_0(k_packed_q5_0)
case FA_TYPE_Q5_1: FA_DEQUANT4_Q5_1(k_packed_q5_1)
case FA_TYPE_Q8_0: FA_DEQUANT4_Q8_0(k_packed_q8_0)
case FA_TYPE_BF16: FA_DEQUANT4_BF16(k_packed_bf16)
}
} else {
switch (FaTypeV) {
@@ -117,6 +124,7 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
case FA_TYPE_Q5_0: FA_DEQUANT4_Q5_0(v_packed_q5_0)
case FA_TYPE_Q5_1: FA_DEQUANT4_Q5_1(v_packed_q5_1)
case FA_TYPE_Q8_0: FA_DEQUANT4_Q8_0(v_packed_q8_0)
case FA_TYPE_BF16: FA_DEQUANT4_BF16(v_packed_bf16)
}
}
return FLOAT_TYPEV4(0);
@@ -662,6 +662,28 @@ void process_shaders() {
}
}
const std::map<std::string, std::string> fa_bf16_dict = {
{"FLOAT_TYPE", "bfloat16_t"},
{"FLOAT_TYPEV2", "bf16vec2"},
{"FLOAT_TYPEV4", "bf16vec4"},
{"ACC_TYPE", "float"},
{"ACC_TYPEV2", "vec2"},
{"ACC_TYPEV4", "vec4"},
{"BFLOAT16", "1"},
};
#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
string_to_spv("flash_attn_f32_f16_bf16", "flash_attn_cm1.comp",
merge_maps(fa_bf16_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"COOPMAT", "1"}}),
true, true, false, false);
#endif
#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
string_to_spv("flash_attn_f32_f16_bf16", "flash_attn_cm2.comp",
merge_maps(fa_bf16_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}}),
true, false, true, false);
#endif
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}, {"FLOAT_TYPEV2", "vec2"}};
for (const auto& tname : type_names) {
+2
View File
@@ -268,6 +268,8 @@ class Keys:
CHAT_TEMPLATE = "tokenizer.chat_template"
CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
CHAT_TEMPLATES = "tokenizer.chat_templates"
# Normalizer constants
NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase"
# FIM/Infill special tokens constants
FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
+3
View File
@@ -1110,6 +1110,9 @@ class GGUFWriter:
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
def add_normalizer_lowercase(self, value: bool) -> None:
self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value)
def add_eot_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.EOT_ID, id)
+27
View File
@@ -52,6 +52,7 @@ class SpecialVocab:
add_special_token: dict[str, bool]
special_token_ids: dict[str, int]
chat_template: str | Sequence[Mapping[str, str]] | None
normalizer_lowercase: bool | None
def __init__(
self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -64,6 +65,7 @@ class SpecialVocab:
self.load_merges = load_merges
self.merges = []
self.chat_template = None
self.normalizer_lowercase = None
if special_token_types is not None:
self.special_token_types = special_token_types
else:
@@ -102,6 +104,10 @@ class SpecialVocab:
if not quiet:
logger.info(f'Setting chat_template to {self.chat_template}')
gw.add_chat_template(self.chat_template)
if self.normalizer_lowercase is not None:
if not quiet:
logger.info(f'Setting normalizer_lowercase to {self.normalizer_lowercase}')
gw.add_normalizer_lowercase(self.normalizer_lowercase)
def _load(self, path: Path) -> None:
self._try_load_from_tokenizer_json(path)
@@ -146,6 +152,24 @@ class SpecialVocab:
return
logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
def _parse_normalizer(self, normalizer: dict) -> None:
# ref: https://huggingface.co/docs/tokenizers/api/normalizers
#
# Detects lowercase normalization in three possible formats:
# 1. Standalone: {"type": "Lowercase"}
# 2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...}
# 3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]}
normalizer_type = normalizer.get('type')
if normalizer_type == 'Lowercase':
self.normalizer_lowercase = True
elif normalizer_type == 'BertNormalizer':
if 'lowercase' in normalizer:
self.normalizer_lowercase = normalizer['lowercase']
elif normalizer_type == 'Sequence':
for norm in normalizer.get('normalizers', []):
self._parse_normalizer(norm)
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
tokenizer = None
tokenizer_file = path / 'tokenizer.json'
@@ -178,6 +202,9 @@ class SpecialVocab:
]
else:
raise ValueError("Unknown tokenizer merges format")
# Parse normalizer configuration (e.g. Lowercase) into metadata
if normalizer := tokenizer.get('normalizer'):
self._parse_normalizer(normalizer)
added_tokens = tokenizer.get('added_tokens', {})
else:
added_tokens = {}
+1
View File
@@ -319,6 +319,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
{ LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
+1
View File
@@ -308,6 +308,7 @@ enum llm_kv {
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
LLM_KV_TOKENIZER_CHAT_TEMPLATE,
LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
LLM_KV_TOKENIZER_FIM_PRE_ID,
LLM_KV_TOKENIZER_FIM_SUF_ID,
LLM_KV_TOKENIZER_FIM_MID_ID,
+5 -5
View File
@@ -410,16 +410,16 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
auto get_tensor_config = [&]() -> tensor_config {
// standard attention
if (std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_kv_weight)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight");
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight");
}
if (std::regex_match(tensor_name, pattern_q_bias) || std::regex_match(tensor_name, pattern_kv_bias)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight");
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight", "ssm_out.weight");
}
if (std::regex_match(tensor_name, pattern_qkv_weight)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1);
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight");
}
if ( std::regex_match(tensor_name, pattern_qkv_bias)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0);
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight", "ssm_out.weight");
}
if (std::regex_match(tensor_name, pattern_qk_norm)) {
return get_tensor_config_impl(tensor->ne[1] == 1 ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight");
@@ -435,7 +435,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
}
if (std::regex_match(tensor_name, pattern_attn_gate_weight)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1);
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight");
}
if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ssm_out.weight");
+50 -3
View File
@@ -519,6 +519,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_WHITESPACE:
// whitespace pre-tokenizer (jinaai/jina-embeddings-v2-base-zh)
regex_exprs = {
"\\S+",
};
byte_encode = false;
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@@ -1671,6 +1678,35 @@ private:
const llama_vocab & vocab;
};
struct llm_tokenizer_whitespace_session : llm_tokenizer_bpe_session {
llm_tokenizer_whitespace_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
void tokenize(const std::string & text, std::vector<llama_token> & output) override {
const bool lowercase = vocab.get_normalizer_lowercase();
std::string segment;
auto flush = [&]() {
if (!segment.empty()) {
llm_tokenizer_bpe_session::tokenize(segment, output);
segment.clear();
}
};
for (uint32_t cpt : unicode_cpts_from_utf8(text)) {
// drop whitespace
if (unicode_cpt_flags_from_cpt(cpt).is_whitespace) {
flush();
} else {
segment += unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt);
}
}
flush();
}
private:
const llama_vocab & vocab;
};
//
// impl
//
@@ -1751,6 +1787,7 @@ struct llama_vocab::impl {
bool remove_extra_whitespaces = false;
bool escape_whitespaces = true;
bool treat_whitespace_as_suffix = false;
bool normalizer_lowercase = true; // Lowercase normalizer (tokenizer.json)
std::unordered_map<std::string, llama_token> token_to_id;
std::vector<token_data> id_to_token;
@@ -1900,7 +1937,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_mask_id = 103;
add_sep = true;
} else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") {
} else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna" || tokenizer_model == "whitespace") {
type = LLAMA_VOCAB_TYPE_BPE;
// read bpe merges and populate bpe ranks
@@ -2119,6 +2156,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "roberta-bpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
add_sep = true;
} else if (
tokenizer_pre == "whitespace") {
pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE;
} else if (
tokenizer_pre == "refact") {
pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -2299,8 +2339,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false);
}
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -3264,6 +3305,8 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
std::unique_ptr<llm_tokenizer_bpe_session> session;
if (vocab.get_tokenizer_model() == "hybriddna") {
session = std::make_unique<llm_tokenizer_hybriddna_session>(vocab, *tok_bpe);
} else if (vocab.get_tokenizer_model() == "whitespace") {
session = std::make_unique<llm_tokenizer_whitespace_session>(vocab, *tok_bpe);
} else {
session = std::make_unique<llm_tokenizer_bpe_session>(vocab, *tok_bpe);
}
@@ -3892,6 +3935,10 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
return pimpl->treat_whitespace_as_suffix;
}
bool llama_vocab::get_normalizer_lowercase() const {
return pimpl->normalizer_lowercase;
}
int llama_vocab::max_token_len() const {
return pimpl->max_token_len;
}
+2
View File
@@ -61,6 +61,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53,
};
struct LLM_KV;
@@ -138,6 +139,7 @@ struct llama_vocab {
bool get_remove_extra_whitespaces () const;
bool get_escape_whitespaces () const;
bool get_treat_whitespace_as_suffix() const;
bool get_normalizer_lowercase () const;
int max_token_len() const;
+3 -1
View File
@@ -225,7 +225,9 @@ static bool llama_prepare_model_devices(const llama_model_params & params, llama
}
case GGML_BACKEND_DEVICE_TYPE_IGPU:
igpus.push_back({false, dev});
if (igpus.empty()) {
igpus.push_back({false, dev});
}
break;
case GGML_BACKEND_DEVICE_TYPE_META:
GGML_ABORT("fatal error");
+2
View File
@@ -7812,6 +7812,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true));
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {5, 5, 1, 32}, {3, 4, 1, 32}, 1, 1, 0, 0, 1, 1, true));
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 1536, 729}, {2, 2, 1536, 4096}, 1, 1, 0, 0, 1, 1, true));
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {128, 128, 1, 2}, {32, 33, 1, 2}, 1, 1, 1, 1, 1, 1, true));
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {128, 128, 2, 1}, {33, 34, 2, 1}, 1, 1, 1, 1, 1, 1, true));
// im2col 3D
test_cases.emplace_back(new test_im2col_3d(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
+11 -4
View File
@@ -526,8 +526,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
max_arch_name_length = std::max(max_arch_name_length, strlen(llm_arch_name(arch)));
}
const std::string template_header = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|%15s|%9s|\n";
const std::string template_row = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|%15s %10s|%20s|\n";
const std::string template_header = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|%15s|%9s|\n";
const std::string template_row_cfg = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|";
const std::string template_row_res = "%15s %10s|%20s|\n";
bool all_ok = true;
common_log_flush(common_log_main());
@@ -565,6 +566,11 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_cpu;
std::vector<float> logits_cpu;
for (device_config & dc : dev_configs) {
// print test config first; should anything fail during model loading or inference, at least we know which test case caused it
printf(template_row_cfg.c_str(),
llm_arch_name(arch), dc.label.c_str(), config_name.c_str());
fflush(stdout);
std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_dev;
std::vector<float> logits_dev;
std::string status_nmse = "\033[1;33mSKIP\033[0m";
@@ -617,8 +623,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
}
}
printf(template_row.c_str(), llm_arch_name(arch), dc.label.c_str(),
config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
// log the results for this test case
printf(template_row_res.c_str(),
status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
}
}
}
+70 -35
View File
@@ -26,17 +26,28 @@ options:
-h, --help
--numa <distribute|isolate|numactl> numa mode (default: disabled)
-r, --repetitions <n> number of times to repeat each test (default: 5)
--prio <0|1|2|3> process/thread priority (default: 0)
--prio <-1|0|1|2|3> process/thread priority (default: 0)
--delay <0...N> (seconds) delay between each test (default: 0)
-o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: md)
-oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
--list-devices list available devices and exit
-v, --verbose verbose output
--progress print test progress indicators
--no-warmup skip warmup runs before benchmarking
-fitt, --fit-target <MiB> fit model to device memory with this margin per device in MiB (default: off)
-fitc, --fit-ctx <n> minimum ctx size for --fit-target (default: 4096)
-rpc, --rpc <rpc_servers> register RPC devices (comma separated)
test parameters:
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-hf, -hfr, --hf-repo <user>/<model>[:quant] Hugging Face model repository; quant is optional, case-insensitive
default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
(default: unused)
-hff, --hf-file <file> Hugging Face model file. If specified, it will override the quant in --hf-repo
(default: unused)
-hft, --hf-token <token> Hugging Face access token
(default: value from HF_TOKEN environment variable)
-p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128)
-pg <pp,tg> (default: )
@@ -49,21 +60,21 @@ test parameters:
-C, --cpu-mask <hex,hex> (default: 0x0)
--cpu-strict <0|1> (default: 0)
--poll <0...100> (default: 50)
-ngl, --n-gpu-layers <n> (default: 99)
-ngl, --n-gpu-layers <n> (default: -1)
-ncmoe, --n-cpu-moe <n> (default: 0)
-sm, --split-mode <none|layer|row> (default: layer)
-sm, --split-mode <none|layer|row|tensor> (default: layer)
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
-fa, --flash-attn <0|1> (default: 0)
-fa, --flash-attn <on|off|auto> (default: auto)
-dev, --device <dev0/dev1/...> (default: auto)
-mmp, --mmap <0|1> (default: 1)
-dio, --direct-io <0|1> (default: 0)
-embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
-ot --override-tensors <tensor name pattern>=<buffer type>;...
-ot --override-tensor <tensor name pattern>=<buffer type>;...
(default: disabled)
-nopo, --no-op-offload <0|1> (default: 0)
-fitt, --fit-target <MiB> fit model to device memory with this margin per device in MiB (default: off)
-fitc, --fit-ctx <n> minimum ctx size for --fit-target (default: 4096)
--no-host <0|1> (default: 0)
Multiple values can be given for each parameter by separating them with ','
or by specifying the parameter multiple times. Ranges can be given as
@@ -97,12 +108,12 @@ $ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 128 | 132.19 ± 0.55 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 256 | 129.37 ± 0.54 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 512 | 123.83 ± 0.25 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 128 | 82.17 ± 0.31 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 256 | 80.74 ± 0.23 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 512 | 78.08 ± 0.07 |
### Prompt processing with different batch sizes
@@ -112,10 +123,10 @@ $ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024
| model | size | params | backend | ngl | n_batch | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 128 | pp 1024 | 1436.51 ± 3.66 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 256 | pp 1024 | 1932.43 ± 23.48 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 512 | pp 1024 | 2254.45 ± 15.59 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 1024 | pp 1024 | 2498.61 ± 13.58 |
### Different numbers of threads
@@ -171,10 +182,10 @@ $ ./llama-bench -d 0,512
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | pp512 | 7340.20 ± 23.45 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | tg128 | 120.60 ± 0.59 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | pp512 @ d512 | 6425.91 ± 18.88 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | tg128 @ d512 | 116.71 ± 0.60 |
## Output formats
@@ -188,8 +199,8 @@ $ ./llama-bench -o md
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | pp 512 | 2368.80 ± 93.24 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 128 | 131.42 ± 0.59 |
### CSV
@@ -198,9 +209,9 @@ $ ./llama-bench -o csv
```
```csv
build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,n_cpu_moe,split_mode,main_gpu,no_kv_offload,flash_attn,devices,tensor_split,tensor_buft_overrides,use_mmap,use_direct_io,embeddings,no_op_offload,no_host,fit_target,fit_min_ctx,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","-1","0","layer","0","0","-1","auto","0.00","none","1","0","0","0","0","0","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","-1","0","layer","0","0","-1","auto","0.00","none","1","0","0","0","0","0","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
```
### JSON
@@ -229,14 +240,22 @@ $ ./llama-bench -o json
"poll": 50,
"type_k": "f16",
"type_v": "f16",
"n_gpu_layers": 99,
"n_gpu_layers": -1,
"n_cpu_moe": 0,
"split_mode": "layer",
"main_gpu": 0,
"no_kv_offload": false,
"flash_attn": false,
"flash_attn": -1,
"devices": "auto",
"tensor_split": "0.00",
"tensor_buft_overrides": "none",
"use_mmap": true,
"use_direct_io": false,
"embeddings": false,
"no_op_offload": 0,
"no_host": false,
"fit_target": 0,
"fit_min_ctx": 0,
"n_prompt": 512,
"n_gen": 0,
"n_depth": 0,
@@ -266,14 +285,22 @@ $ ./llama-bench -o json
"poll": 50,
"type_k": "f16",
"type_v": "f16",
"n_gpu_layers": 99,
"n_gpu_layers": -1,
"n_cpu_moe": 0,
"split_mode": "layer",
"main_gpu": 0,
"no_kv_offload": false,
"flash_attn": false,
"flash_attn": -1,
"devices": "auto",
"tensor_split": "0.00",
"tensor_buft_overrides": "none",
"use_mmap": true,
"use_direct_io": false,
"embeddings": false,
"no_op_offload": 0,
"no_host": false,
"fit_target": 0,
"fit_min_ctx": 0,
"n_prompt": 0,
"n_gen": 128,
"n_depth": 0,
@@ -296,8 +323,8 @@ $ ./llama-bench -o jsonl
```
```json lines
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": -1, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": -1, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": -1, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": -1, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
```
@@ -310,7 +337,7 @@ $ ./llama-bench -o sql
```
```sql
CREATE TABLE IF NOT EXISTS test (
CREATE TABLE IF NOT EXISTS llama_bench (
build_commit TEXT,
build_number INTEGER,
cpu_info TEXT,
@@ -329,13 +356,21 @@ CREATE TABLE IF NOT EXISTS test (
type_k TEXT,
type_v TEXT,
n_gpu_layers INTEGER,
n_cpu_moe INTEGER,
split_mode TEXT,
main_gpu INTEGER,
no_kv_offload INTEGER,
flash_attn INTEGER,
devices TEXT,
tensor_split TEXT,
tensor_buft_overrides TEXT,
use_mmap INTEGER,
use_direct_io INTEGER,
embeddings INTEGER,
no_op_offload INTEGER,
no_host INTEGER,
fit_target INTEGER,
fit_min_ctx INTEGER,
n_prompt INTEGER,
n_gen INTEGER,
n_depth INTEGER,
@@ -346,6 +381,6 @@ CREATE TABLE IF NOT EXISTS test (
stddev_ts REAL
);
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, n_cpu_moe, split_mode, main_gpu, no_kv_offload, flash_attn, devices, tensor_split, tensor_buft_overrides, use_mmap, use_direct_io, embeddings, no_op_offload, no_host, fit_target, fit_min_ctx, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '-1', '0', 'layer', '0', '0', '-1', 'auto', '0.00', 'none', '1', '0', '0', '0', '0', '0', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, n_cpu_moe, split_mode, main_gpu, no_kv_offload, flash_attn, devices, tensor_split, tensor_buft_overrides, use_mmap, use_direct_io, embeddings, no_op_offload, no_host, fit_target, fit_min_ctx, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '-1', '0', 'layer', '0', '0', '-1', 'auto', '0.00', 'none', '1', '0', '0', '0', '0', '0', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
```
+38 -16
View File
@@ -19,6 +19,7 @@
#include <vector>
#include <unordered_set>
#include "arg.h"
#include "build-info.h"
#include "common.h"
#include "download.h"
@@ -275,9 +276,11 @@ static std::string pair_str(const std::pair<int, int> & p) {
return buf;
}
static std::vector<int> parse_int_range(const std::string & s) {
static std::vector<int> parse_int_range(const std::string & s, bool allow_negative = false) {
// first[-last[(+|*)step]]
std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
std::regex range_regex(allow_negative
? R"(^(-?\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))"
: R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
std::smatch match;
std::string::const_iterator search_start(s.cbegin());
@@ -337,7 +340,7 @@ struct cmd_params {
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
std::vector<bool> flash_attn;
std::vector<llama_flash_attn_type> flash_attn;
std::vector<std::vector<ggml_backend_dev_t>> devices;
std::vector<std::vector<float>> tensor_split;
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
@@ -376,12 +379,12 @@ static const cmd_params cmd_params_defaults = {
/* cpu_mask */ { "0x0" },
/* cpu_strict */ { false },
/* poll */ { 50 },
/* n_gpu_layers */ { 99 },
/* n_gpu_layers */ { -1 },
/* n_cpu_moe */ { 0 },
/* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
/* main_gpu */ { 0 },
/* no_kv_offload */ { false },
/* flash_attn */ { false },
/* flash_attn */ { LLAMA_FLASH_ATTN_TYPE_AUTO },
/* devices */ { {} },
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
@@ -451,7 +454,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -sm, --split-mode <none|layer|row|tensor> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -fa, --flash-attn <on|off|auto> (default: %s)\n", join(transform_to_str(cmd_params_defaults.flash_attn, llama_flash_attn_type_name), ",").c_str());
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
@@ -710,7 +713,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
auto p = parse_int_range(argv[i]);
auto p = parse_int_range(argv[i], /*allow_negative=*/true);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (arg == "-ncmoe" || arg == "--n-cpu-moe") {
if (++i >= argc) {
@@ -793,8 +796,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
auto p = string_split<std::string>(argv[i], split_delim);
std::vector<llama_flash_attn_type> types;
for (const auto & v : p) {
llama_flash_attn_type type;
if (common_arg_utils::is_truthy(v)) {
type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
} else if (common_arg_utils::is_falsey(v)) {
type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
} else if (common_arg_utils::is_autoy(v)) {
type = LLAMA_FLASH_ATTN_TYPE_AUTO;
} else {
invalid_param = true;
break;
}
types.push_back(type);
}
if (invalid_param) {
break;
}
params.flash_attn.insert(params.flash_attn.end(), types.begin(), types.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
@@ -1138,7 +1160,7 @@ struct cmd_params_instance {
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool flash_attn;
llama_flash_attn_type flash_attn;
std::vector<ggml_backend_dev_t> devices;
std::vector<float> tensor_split;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
@@ -1222,7 +1244,7 @@ struct cmd_params_instance {
cparams.type_k = type_k;
cparams.type_v = type_v;
cparams.offload_kqv = !no_kv_offload;
cparams.flash_attn_type = flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
cparams.flash_attn_type = flash_attn;
cparams.embeddings = embeddings;
cparams.op_offload = !no_op_offload;
cparams.swa_full = false;
@@ -1400,7 +1422,7 @@ struct test {
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool flash_attn;
llama_flash_attn_type flash_attn;
std::vector<ggml_backend_dev_t> devices;
std::vector<float> tensor_split;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
@@ -1522,10 +1544,10 @@ struct test {
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" ||
field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe" ||
field == "fit_target" || field == "fit_min_ctx") {
field == "fit_target" || field == "fit_min_ctx" || field == "flash_attn") {
return INT;
}
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" ||
field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") {
return BOOL;
}
@@ -1594,7 +1616,7 @@ struct test {
split_mode_str(split_mode),
std::to_string(main_gpu),
std::to_string(no_kv_offload),
std::to_string(flash_attn),
std::to_string((int) flash_attn),
devices_to_string(devices),
tensor_split_str,
tensor_buft_overrides_str,
@@ -1779,7 +1801,7 @@ struct markdown_printer : public printer {
return 6;
}
if (field == "flash_attn") {
return 2;
return 3;
}
if (field == "devices") {
return -12;
+6
View File
@@ -7,3 +7,9 @@ bun.lockb
# Miscellaneous
/static/
# Build output
/dist/
/build/
/.svelte-kit/
test-results
+5 -4
View File
@@ -9,6 +9,7 @@
#include <fstream>
#include <string>
#include <vector>
#include <cinttypes>
#include <cstdint>
// Computes FNV-1a hash of the data
@@ -126,10 +127,10 @@ int main(int argc, char ** argv) {
append_bytes_hex(cpp, bytes);
const auto hash = fnv_hash(bytes.data(), bytes.size());
cpp += fmt("};\nstatic const size_t asset_%d_size = %lu;\n",
i, static_cast<unsigned long>(bytes.size()));
cpp += fmt("static const char asset_%d_etag[] = \"\\\"0x%016lx\\\"\";\n\n",
i, static_cast<unsigned long>(hash));
cpp += fmt("};\nstatic const size_t asset_%d_size = %zu;\n",
i, bytes.size());
cpp += fmt("static const char asset_%d_etag[] = \"\\\"0x%016" PRIx64 "\\\"\";\n\n",
i, hash);
}
cpp += "static const llama_ui_asset g_assets[] = {\n";
+2 -2
View File
@@ -45,8 +45,8 @@ export default ts.config(
}
},
{
// Exclude Storybook files from main ESLint rules
ignores: ['.storybook/**/*']
// Exclude generated build output and Storybook files from ESLint
ignores: ['dist/**', 'build/**', '.svelte-kit/**', 'test-results/**', '.storybook/**/*']
},
storybook.configs['flat/recommended']
);
@@ -75,9 +75,13 @@
}
function handleSave() {
if (localConfig.custom && typeof localConfig.custom === 'string' && localConfig.custom.trim()) {
if (
localConfig.customJson &&
typeof localConfig.customJson === 'string' &&
localConfig.customJson.trim()
) {
try {
JSON.parse(localConfig.custom);
JSON.parse(localConfig.customJson);
} catch (error) {
alert('Invalid JSON in custom parameters. Please check the format and try again.');
console.error(error);
+2 -1
View File
@@ -66,5 +66,6 @@ export const SETTINGS_KEYS = {
EXCLUDE_REASONING_FROM_CONTEXT: 'excludeReasoningFromContext',
SHOW_RAW_OUTPUT_SWITCH: 'showRawOutputSwitch',
// PY_INTERPRETER_ENABLED: 'pyInterpreterEnabled',
CUSTOM: 'custom'
CUSTOM_JSON: 'customJson',
CUSTOM_CSS: 'customCss'
} as const;
@@ -659,12 +659,24 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
}
},
{
key: SETTINGS_KEYS.CUSTOM,
key: SETTINGS_KEYS.CUSTOM_JSON,
label: 'Custom JSON',
help: 'Custom JSON parameters to send to the API. Must be valid JSON format.',
defaultValue: '',
type: SettingsFieldType.TEXTAREA,
section: SETTINGS_SECTION_SLUGS.DEVELOPER
},
{
key: SETTINGS_KEYS.CUSTOM_CSS,
label: 'Custom CSS',
help: 'CSS injected into the page at runtime. Set it here, or ship it server side via the --ui-config customCss field.',
defaultValue: '',
type: SettingsFieldType.TEXTAREA,
section: SETTINGS_SECTION_SLUGS.DEVELOPER,
sync: {
serverKey: SETTINGS_KEYS.CUSTOM_CSS,
paramType: SyncableParameterType.STRING
}
}
]
},
+26 -1
View File
@@ -470,11 +470,36 @@ const themeMigration: Migration = {
// Migration Registry & Runner
const CUSTOM_JSON_MIGRATION_ID = 'custom-json-key-v1';
const customJsonKeyMigration: Migration = {
id: CUSTOM_JSON_MIGRATION_ID,
description: 'Copy legacy custom config key to customJson (non-destructive)',
async run(): Promise<void> {
const configRaw = localStorage.getItem(CONFIG_LOCALSTORAGE_KEY);
if (configRaw === null) return;
const config = JSON.parse(configRaw);
if (!('custom' in config)) return;
if (SETTINGS_KEYS.CUSTOM_JSON in config) return;
config[SETTINGS_KEYS.CUSTOM_JSON] = config.custom;
localStorage.setItem(CONFIG_LOCALSTORAGE_KEY, JSON.stringify(config));
// Non-destructive: keep the legacy custom key for downgrade compatibility
if (import.meta.env.DEV && import.meta.env.VITE_DEBUG)
console.log(`[Migration] Custom JSON: copied custom to customJson (preserved old key)`);
}
};
const migrations: Migration[] = [
localStorageMigration,
idxdbMigration,
legacyMessageMigration,
themeMigration
themeMigration,
customJsonKeyMigration
];
export const MigrationService = {
+1 -1
View File
@@ -1869,7 +1869,7 @@ class ChatStore {
apiOptions.backend_sampling = currentConfig.backend_sampling;
if (currentConfig.custom) apiOptions.custom = currentConfig.custom;
if (currentConfig.customJson) apiOptions.custom = currentConfig.customJson;
return apiOptions;
}
+1 -1
View File
@@ -57,7 +57,7 @@ class ToolsStore {
}
get customTools(): OpenAIToolDefinition[] {
const raw = config().custom;
const raw = config().customJson;
if (!raw || typeof raw !== 'string') return [];
try {
+2 -2
View File
@@ -90,8 +90,8 @@ export interface SettingsChatServiceOptions {
// Sampler configuration
samplers?: string | string[];
backend_sampling?: boolean;
// Custom parameters
custom?: string;
// Custom JSON parameters
customJson?: string;
timings_per_token?: boolean;
// Continuation control (vLLM compat), opt in to the explicit continue final message flag
continueFinalMessage?: boolean;
+14
View File
@@ -169,6 +169,14 @@
}
});
// Inject custom CSS at runtime through an action on the head style node
// textContent keeps the value as text, never parsed as HTML
function customCss(node: HTMLStyleElement) {
$effect(() => {
node.textContent = (config().customCss as string | undefined) ?? '';
});
}
// Fetch router models when in router mode (for status and modalities)
// Wait for models to be loaded first, run only once
let routerModelsFetched = false;
@@ -227,6 +235,12 @@
});
</script>
<svelte:head>
{#if config().customCss}
<style use:customCss></style>
{/if}
</svelte:head>
<Tooltip.Provider delayDuration={TOOLTIP_DELAY_DURATION}>
<ModeWatcher />
<Toaster richColors />