mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-30 17:47:40 +02:00
Compare commits
63 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 87f18f760e | |||
| cf285e195e | |||
| 07ec9fd8d9 | |||
| 5a2e768430 | |||
| 5a727def3d | |||
| f0bbb1a9ea | |||
| a0a98e702c | |||
| 8c75e6ee7e | |||
| 651afdb47d | |||
| 5f0e5348ba | |||
| 549b9d8433 | |||
| 5d246a792d | |||
| 63248fc3e3 | |||
| 83eebe9d08 | |||
| fff63b5108 | |||
| f3061116ff | |||
| 1c0f6db545 | |||
| cec51c7a7d | |||
| b22ff4b7b4 | |||
| c0c7e147e7 | |||
| b0df4c0cfd | |||
| a497476330 | |||
| 95405ac65f | |||
| 0f3cb3fc8b | |||
| 1acee6bf89 | |||
| ef570f6308 | |||
| cc9e331213 | |||
| bcfd1989e9 | |||
| 56f16f235c | |||
| 8cc67efcd4 | |||
| 95feeab52e | |||
| 99d4026b11 | |||
| 9c92e96a64 | |||
| afcda09d15 | |||
| bbce619adb | |||
| 4f0e43da6f | |||
| bb28c1fe24 | |||
| ee7c30578a | |||
| 47c0eda9d4 | |||
| 5306f4b3b5 | |||
| 40d5358d3c | |||
| b65bb4baae | |||
| a1a69f777a | |||
| 52fb93a2bd | |||
| c9021714e8 | |||
| 1d7ab2b947 | |||
| 12e5d99078 | |||
| 7ea23ddf7b | |||
| 2fc8d1851e | |||
| 5e932a1c8d | |||
| 2754ce1b3e | |||
| eeeaf6180b | |||
| 0be84685bd | |||
| ce02093fdd | |||
| 6a257d4463 | |||
| 3a479c9132 | |||
| ad27757261 | |||
| 3a6db741a8 | |||
| 510b5c2a35 | |||
| a8681a0ed2 | |||
| acd604fb27 | |||
| 6ce96713de | |||
| c9872a2575 |
@@ -59,6 +59,7 @@ jobs:
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
@@ -89,6 +90,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -138,6 +140,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -163,6 +166,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -206,6 +210,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
|
||||
@@ -5,23 +5,23 @@ on:
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, Linux, CPU]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y build-essential tcl cmake
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
PREFIX="$(pwd)"/inst
|
||||
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
|
||||
-DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_PREFIX_PATH="$PREFIX" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix "$PREFIX" --config Release
|
||||
|
||||
|
||||
@@ -55,24 +55,7 @@ env:
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
determine-tag:
|
||||
name: Determine tag name
|
||||
runs-on: ubuntu-slim
|
||||
outputs:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
ggml-ci-nvidia-cuda:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
@@ -82,14 +65,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
nvidia-smi
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
@@ -99,14 +79,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm2:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
|
||||
|
||||
steps:
|
||||
@@ -116,14 +93,12 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-webgpu:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
runs-on: [self-hosted, Linux, NVIDIA, X64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -149,7 +124,7 @@ jobs:
|
||||
GG_BUILD_WEBGPU=1 \
|
||||
GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMX-compatible machine
|
||||
#ggml-ci-cpu-amx:
|
||||
@@ -163,7 +138,7 @@ jobs:
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-vulkan:
|
||||
@@ -178,7 +153,7 @@ jobs:
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# vulkaninfo --summary
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-rocm:
|
||||
@@ -193,10 +168,9 @@ jobs:
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# amd-smi static
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-metal:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -206,13 +180,10 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-webgpu:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -235,14 +206,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-vulkan:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -252,14 +220,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-linux-intel-vulkan:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, Linux, Intel]
|
||||
|
||||
steps:
|
||||
@@ -271,14 +236,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-win-intel-vulkan:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, Windows, X64, Intel]
|
||||
|
||||
steps:
|
||||
@@ -293,7 +255,6 @@ jobs:
|
||||
MSYSTEM: UCRT64
|
||||
CHERE_INVOKING: 1
|
||||
PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
# Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
|
||||
@@ -301,7 +262,6 @@ jobs:
|
||||
LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
|
||||
|
||||
ggml-ci-intel-openvino-gpu-low-perf:
|
||||
needs: determine-tag
|
||||
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||
|
||||
concurrency:
|
||||
@@ -333,8 +293,64 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-arm64-cpu-low-perf:
|
||||
runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-arm64-cpu-high-perf:
|
||||
runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: not sure how to detect ARM flags on DGX Spark. currently get this error during cmake:
|
||||
# CMake Warning at ggml/src/ggml-cpu/CMakeLists.txt:147 (message):
|
||||
# ARM -march/-mcpu not found, -mcpu=native will be used
|
||||
#
|
||||
# if we resolve this, we should be able to offload these jobs to the self-hosted runners
|
||||
#
|
||||
# ggml-ci-arm64-cpu-high-perf-sve:
|
||||
# runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
#
|
||||
# ggml-ci-arm64-cpu-kleidiai:
|
||||
# runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
+52
-50
@@ -931,31 +931,32 @@ jobs:
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-arm64-cpu-low-perf:
|
||||
runs-on: ubuntu-22.04-arm
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ggml-ci-arm64-cpu-low-perf
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
# note: moved to build-self-hosted.yml - can remove from here when everything is stable
|
||||
# ggml-ci-arm64-cpu-low-perf:
|
||||
# runs-on: ubuntu-22.04-arm
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ggml-ci-arm64-cpu-low-perf
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-x64-cpu-high-perf:
|
||||
runs-on: ubuntu-22.04
|
||||
@@ -983,31 +984,32 @@ jobs:
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-arm64-cpu-high-perf:
|
||||
runs-on: ubuntu-22.04-arm
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ggml-ci-arm64-cpu-high-perf
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
# note: moved to build-self-hosted.yml - can remove from here when everything is stable
|
||||
# ggml-ci-arm64-cpu-high-perf:
|
||||
# runs-on: ubuntu-22.04-arm
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ggml-ci-arm64-cpu-high-perf
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-arm64-cpu-high-perf-sve:
|
||||
runs-on: ubuntu-22.04-arm
|
||||
|
||||
@@ -19,7 +19,7 @@ on:
|
||||
|
||||
jobs:
|
||||
check-vendor:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
@@ -15,7 +15,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
model-naming:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- name: Check model naming conventions
|
||||
|
||||
@@ -15,7 +15,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
editorconfig:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
|
||||
|
||||
@@ -12,7 +12,7 @@ on:
|
||||
|
||||
jobs:
|
||||
pre-tokenizer-hashes:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
|
||||
@@ -20,7 +20,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
python-check-requirements:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, CPU, fast]
|
||||
name: check-requirements
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
|
||||
@@ -21,7 +21,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
flake8-lint:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
name: Lint
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
|
||||
@@ -22,7 +22,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
python-type-check:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
name: python type-check
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
|
||||
@@ -1108,6 +1108,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -1233,6 +1234,9 @@ jobs:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
|
||||
ui-build:
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
@@ -1258,6 +1262,7 @@ jobs:
|
||||
- macOS-cpu
|
||||
- ios-xcode-build
|
||||
- openEuler-cann
|
||||
- ui-build
|
||||
|
||||
outputs:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
@@ -1317,6 +1322,18 @@ jobs:
|
||||
mv -v artifact/*.zip release
|
||||
mv -v artifact/*.tar.gz release
|
||||
|
||||
- name: Download UI build
|
||||
id: download_ui
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: ui-build
|
||||
path: ./ui-dist
|
||||
|
||||
- name: Package UI
|
||||
id: package_ui
|
||||
run: |
|
||||
tar -czvf release/llama-${{ steps.tag.outputs.name }}-ui.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./ui-dist .
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: ggml-org/action-create-release@v1
|
||||
@@ -1366,6 +1383,9 @@ jobs:
|
||||
- [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
|
||||
- [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz)
|
||||
|
||||
**UI:**
|
||||
- [UI](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-ui.tar.gz)
|
||||
|
||||
- name: Upload release
|
||||
id: upload_release
|
||||
uses: actions/github-script@v8
|
||||
|
||||
@@ -91,45 +91,44 @@ jobs:
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
# TODO: provision CUDA runner
|
||||
# server-cuda:
|
||||
# runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
#
|
||||
# name: server-cuda (${{ matrix.wf_name }})
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build_type: [Release]
|
||||
# wf_name: ["GPUx1"]
|
||||
# include:
|
||||
# - build_type: Release
|
||||
# extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
# wf_name: "GPUx1, backend-sampling"
|
||||
# fail-fast: false
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
# ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
# cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
#
|
||||
# - name: Tests
|
||||
# id: server_integration_tests
|
||||
# if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
# run: |
|
||||
# cd tools/server/tests
|
||||
# python3 -m venv venv
|
||||
# source venv/bin/activate
|
||||
# pip install -r requirements.txt
|
||||
# export ${{ matrix.extra_args }}
|
||||
# pytest -v -x -m "not slow"
|
||||
server-cuda:
|
||||
runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
|
||||
name: server-cuda (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-kleidiai:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
@@ -54,8 +54,13 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
ui-build:
|
||||
name: Build Web UI
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
needs: ui-build
|
||||
|
||||
name: server (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
@@ -93,12 +98,11 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
- name: Download built UI
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
name: ui-build
|
||||
path: tools/ui/dist
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
|
||||
@@ -6,7 +6,7 @@ on:
|
||||
jobs:
|
||||
build:
|
||||
name: Build static output
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
|
||||
- name: Generate checksums
|
||||
run: |
|
||||
cd build/tools/ui/dist
|
||||
cd tools/ui/dist
|
||||
for f in *; do
|
||||
sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
|
||||
done
|
||||
@@ -40,5 +40,5 @@ jobs:
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: build/tools/ui/dist/
|
||||
path: tools/ui/dist/
|
||||
retention-days: 1
|
||||
|
||||
@@ -0,0 +1,118 @@
|
||||
name: CI (UI, self-hosted)
|
||||
|
||||
# these are the same as ui-ci.yml, but with self-hosted runners
|
||||
# the runners come with pre-installed Playwright browsers version: 1.56.1
|
||||
# the jobs are much lighter because they don't need to install node and playwright browsers
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
sha:
|
||||
description: 'Commit SHA1 to build'
|
||||
required: false
|
||||
type: string
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/ui-ci-self-hosted.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/ui-ci-self-hosted.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
ui-build:
|
||||
name: Build static output
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
ui-checks:
|
||||
name: UI Checks
|
||||
needs: ui-build
|
||||
runs-on: [self-hosted, PLAYWRIGHT]
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Install dependencies
|
||||
id: setup
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run type checking
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run check
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run linting
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run lint
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Client tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:client
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Unit tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:unit
|
||||
working-directory: tools/ui
|
||||
|
||||
e2e-tests:
|
||||
name: E2E Tests
|
||||
needs: ui-build
|
||||
runs-on: [self-hosted, PLAYWRIGHT]
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Install dependencies
|
||||
id: setup
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build application
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run build
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build Storybook
|
||||
if: ${{ always() }}
|
||||
run: npm run build-storybook
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run UI tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:ui -- --testTimeout=60000
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run E2E tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:e2e
|
||||
working-directory: tools/ui
|
||||
@@ -12,6 +12,7 @@ on:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/ui-ci.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
@@ -19,6 +20,7 @@ on:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/ui-ci.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
@@ -41,6 +43,8 @@ jobs:
|
||||
ui-checks:
|
||||
name: UI Checks
|
||||
needs: ui-build
|
||||
# TODO: cannot move to self-hosted runner because the Playwright browsers require sudo to install
|
||||
# figure out how to fix that - maybe provision runners with already installed browsers and do not do the install step?
|
||||
runs-on: ubuntu-latest
|
||||
continue-on-error: true
|
||||
steps:
|
||||
|
||||
@@ -38,7 +38,7 @@ jobs:
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: ui-build
|
||||
path: build/tools/ui/dist/
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Install Hugging Face Hub CLI
|
||||
run: pip install -U huggingface_hub
|
||||
@@ -49,12 +49,12 @@ jobs:
|
||||
- name: Sync built files to Hugging Face bucket (version tag)
|
||||
run: |
|
||||
# Upload the built files to the Hugging Face bucket under the release version
|
||||
hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
|
||||
hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
|
||||
|
||||
- name: Sync built files to Hugging Face bucket (latest)
|
||||
run: |
|
||||
# Also upload to the 'latest' directory for fallback downloads
|
||||
hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
|
||||
hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
|
||||
|
||||
- name: Verify upload
|
||||
run: |
|
||||
|
||||
@@ -14,7 +14,7 @@ on:
|
||||
|
||||
jobs:
|
||||
update-ops-docs:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
|
||||
+3
-2
@@ -1,7 +1,7 @@
|
||||
You are a coding agent. Here are some very important rules that you must follow:
|
||||
|
||||
General:
|
||||
- By very precise and concise when writing code, comments, explanations, etc.
|
||||
- Be very precise and concise when writing code, comments, explanations, etc.
|
||||
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
|
||||
- Don't try to build or run the code unless you are explicitly asked to do so
|
||||
- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
|
||||
@@ -16,7 +16,8 @@ Pull requests (PRs):
|
||||
- New branch names are prefixed with "gg/"
|
||||
- Before opening a pull request, ask the user to confirm the description
|
||||
- When creating a pull request, look for the repository's PR template and follow it
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
|
||||
- Ask the user to tell you what model was used and write it in place of [MODEL]
|
||||
- Always create the pull requests in draft mode
|
||||
|
||||
Commits:
|
||||
|
||||
+1
-11
@@ -108,20 +108,10 @@ option(LLAMA_BUILD_TESTS "llama: build tests"
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_APP "llama: build the unified binary" OFF)
|
||||
option(LLAMA_BUILD_APP "llama: build the unified binary" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
|
||||
|
||||
# Backward compat: when old var is set but new one isn't, forward the value
|
||||
if(DEFINED LLAMA_BUILD_WEBUI)
|
||||
set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI})
|
||||
message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead")
|
||||
endif()
|
||||
if(DEFINED LLAMA_USE_PREBUILT_WEBUI)
|
||||
set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI})
|
||||
message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead")
|
||||
endif()
|
||||
|
||||
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
|
||||
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
|
||||
|
||||
|
||||
@@ -49,7 +49,6 @@
|
||||
/examples/parallel/ @ggerganov
|
||||
/examples/passkey/ @ggerganov
|
||||
/examples/retrieval/ @ggerganov
|
||||
/examples/save-load-state/ @ggerganov
|
||||
/examples/speculative-simple/ @ggerganov
|
||||
/examples/speculative/ @ggerganov
|
||||
/ggml/cmake/ @ggerganov
|
||||
|
||||
@@ -27,6 +27,7 @@ LLM inference in C/C++
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
- WebGPU support is now available in the browser, see a blog/demo introducing it [here](https://reeselevine.github.io/llamas-on-the-web/).
|
||||
|
||||
----
|
||||
|
||||
@@ -290,7 +291,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [CANN](docs/build.md#cann) | Ascend NPU |
|
||||
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
||||
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
|
||||
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
|
||||
| [WebGPU](docs/build.md#webgpu) | All |
|
||||
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
|
||||
| [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
|
||||
| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
|
||||
|
||||
+10
-1
@@ -3,7 +3,16 @@ set(TARGET llama-app)
|
||||
add_executable(${TARGET} llama.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE llama-server-impl llama-cli-impl llama-completion-impl llama-bench-impl)
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
llama-server-impl
|
||||
llama-cli-impl
|
||||
llama-completion-impl
|
||||
llama-bench-impl
|
||||
llama-batched-bench-impl
|
||||
llama-fit-params-impl
|
||||
llama-quantize-impl
|
||||
llama-perplexity-impl
|
||||
)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
|
||||
+33
-5
@@ -1,14 +1,24 @@
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// visible
|
||||
int llama_server(int argc, char ** argv);
|
||||
int llama_cli(int argc, char ** argv);
|
||||
|
||||
// hidden
|
||||
int llama_completion(int argc, char ** argv);
|
||||
int llama_bench(int argc, char ** argv);
|
||||
int llama_batched_bench(int argc, char ** argv);
|
||||
int llama_fit_params(int argc, char ** argv);
|
||||
int llama_quantize(int argc, char ** argv);
|
||||
int llama_perplexity(int argc, char ** argv);
|
||||
|
||||
static int help(int argc, char ** argv);
|
||||
static int version(int argc, char ** argv);
|
||||
|
||||
struct command {
|
||||
const char * name;
|
||||
@@ -19,13 +29,23 @@ struct command {
|
||||
};
|
||||
|
||||
static const command cmds[] = {
|
||||
{"serve", "HTTP API server", {"server"}, false, llama_server },
|
||||
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
|
||||
{"completion", "Text completion", {"complete"}, true, llama_completion },
|
||||
{"bench", "Benchmarking tool", {}, true, llama_bench },
|
||||
{"help", "Show available commands", {}, true, help },
|
||||
{"serve", "HTTP API server", {"server"}, false, llama_server },
|
||||
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
|
||||
{"completion", "Text completion", {"complete"}, true, llama_completion },
|
||||
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
|
||||
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
|
||||
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
|
||||
{"quantize", "Quantize a model", {}, true, llama_quantize },
|
||||
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
|
||||
{"version", "Show version", {}, true, version },
|
||||
{"help", "Show available commands", {}, true, help },
|
||||
};
|
||||
|
||||
static int version(int argc, char ** argv) {
|
||||
printf("%s\n", llama_build_info());
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int help(int argc, char ** argv) {
|
||||
const bool show_all = argc >= 2 && std::string(argv[1]) == "all";
|
||||
|
||||
@@ -58,6 +78,14 @@ int main(int argc, char ** argv) {
|
||||
|
||||
for (const auto & cmd : cmds) {
|
||||
if (matches(arg, cmd)) {
|
||||
|
||||
// router spawns children through this same binary, it needs the
|
||||
// subcommand to relaunch as 'llama serve' and not bare options
|
||||
#ifdef _WIN32
|
||||
_putenv_s("LLAMA_APP_CMD", cmd.name);
|
||||
#else
|
||||
setenv("LLAMA_APP_CMD", cmd.name, 1);
|
||||
#endif
|
||||
return cmd.func(argc - 1, argv + 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ VISIONOS_MIN_OS_VERSION=1.0
|
||||
TVOS_MIN_OS_VERSION=16.4
|
||||
|
||||
BUILD_SHARED_LIBS=OFF
|
||||
LLAMA_BUILD_APP=OFF
|
||||
LLAMA_BUILD_EXAMPLES=OFF
|
||||
LLAMA_BUILD_TOOLS=OFF
|
||||
LLAMA_BUILD_TESTS=OFF
|
||||
@@ -31,6 +32,7 @@ COMMON_CMAKE_ARGS=(
|
||||
-DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
|
||||
-DLLAMA_BUILD_APP=${LLAMA_BUILD_APP}
|
||||
-DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
|
||||
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
|
||||
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
|
||||
|
||||
@@ -238,7 +238,7 @@ function gg_run_ctest_debug {
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops|test-llama-archs" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
}
|
||||
@@ -461,10 +461,10 @@ function gg_run_qwen3_0_6b {
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
||||
@@ -3591,6 +3591,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.speculative.draft.p_min = std::stof(value);
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN"));
|
||||
add_opt(common_arg(
|
||||
{"--spec-draft-backend-sampling"},
|
||||
{"--no-spec-draft-backend-sampling"},
|
||||
string_format("offload draft sampling to the backend (default: %s)",
|
||||
params.speculative.draft.backend_sampling ? "enabled" : "disabled"),
|
||||
[](common_params & params, bool value) {
|
||||
params.speculative.draft.backend_sampling = value;
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING"));
|
||||
add_opt(common_arg(
|
||||
{"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>",
|
||||
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
|
||||
|
||||
@@ -219,6 +219,7 @@ struct common_chat_parser_params {
|
||||
bool reasoning_in_content = false;
|
||||
std::string generation_prompt;
|
||||
bool parse_tool_calls = true;
|
||||
bool is_continuation = false;
|
||||
bool echo = false; // Include assistant prefilled msg in output
|
||||
bool debug = false; // Enable debug output for PEG parser
|
||||
common_peg_arena parser = {};
|
||||
|
||||
+3
-5
@@ -305,6 +305,8 @@ struct common_params_speculative_draft {
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.0f; // minimum speculative decoding probability (greedy)
|
||||
|
||||
bool backend_sampling = true; // offload draft sampling to the backend (default: on)
|
||||
|
||||
common_params_model mparams;
|
||||
|
||||
llama_context * ctx_tgt = nullptr;
|
||||
@@ -615,11 +617,7 @@ struct common_params {
|
||||
std::map<std::string, std::string> default_template_kwargs;
|
||||
|
||||
// UI configs
|
||||
#ifdef LLAMA_UI_DEFAULT_ENABLED
|
||||
bool ui = LLAMA_UI_DEFAULT_ENABLED != 0;
|
||||
#else
|
||||
bool ui = true; // default to enabled when not set
|
||||
#endif
|
||||
bool ui = true;
|
||||
|
||||
// Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
|
||||
bool webui = ui;
|
||||
|
||||
+1
-1
@@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error {
|
||||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
static std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "llama.h"
|
||||
#include "../src/llama-ext.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
enum common_params_fit_status {
|
||||
COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
|
||||
@@ -30,3 +35,14 @@ void common_fit_print(
|
||||
struct llama_context_params * cparams);
|
||||
|
||||
void common_memory_breakdown_print(const struct llama_context * ctx);
|
||||
|
||||
// Load a model + context with no_alloc and return the per-device memory breakdown.
|
||||
std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const struct llama_model_params * mparams,
|
||||
const struct llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
enum ggml_log_level log_level);
|
||||
|
||||
+37
-7
@@ -33,16 +33,15 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro
|
||||
};
|
||||
|
||||
static std::string common_speculative_get_devices_str(const std::vector<ggml_backend_dev_t> & devices) {
|
||||
if (devices.empty()) {
|
||||
return "default";
|
||||
}
|
||||
|
||||
std::string result;
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
if (i > 0) result += ", ";
|
||||
if (devices[i] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (!result.empty()) result += ", ";
|
||||
result += ggml_backend_dev_name(devices[i]);
|
||||
}
|
||||
return result;
|
||||
return result.empty() ? "default" : result;
|
||||
}
|
||||
|
||||
struct common_speculative_config {
|
||||
@@ -414,6 +413,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
|
||||
std::vector<common_sampler_ptr> smpls;
|
||||
|
||||
// backend sampler chain per seq, attached to ctx_dft
|
||||
std::vector<llama_sampler *> backend_chains;
|
||||
|
||||
int32_t n_embd = 0;
|
||||
|
||||
// Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
|
||||
@@ -445,7 +447,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
n_embd = llama_model_n_embd(llama_get_model(ctx_dft));
|
||||
|
||||
LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
|
||||
LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
|
||||
this->params.n_gpu_layers,
|
||||
ggml_type_name(this->params.cache_type_k),
|
||||
@@ -469,6 +471,22 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
|
||||
}
|
||||
|
||||
// offload draft sampling to the backend
|
||||
backend_chains.assign(n_seq, nullptr);
|
||||
if (this->params.backend_sampling) {
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
|
||||
llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));
|
||||
|
||||
if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
|
||||
LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
|
||||
llama_sampler_free(chain);
|
||||
chain = nullptr;
|
||||
}
|
||||
backend_chains[seq_id] = chain;
|
||||
}
|
||||
}
|
||||
|
||||
llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false);
|
||||
llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);
|
||||
|
||||
@@ -484,6 +502,18 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
}
|
||||
|
||||
~common_speculative_impl_draft_mtp() override {
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) {
|
||||
if (backend_chains[seq_id] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (ctx_dft) {
|
||||
llama_set_sampler(ctx_dft, seq_id, nullptr);
|
||||
}
|
||||
llama_sampler_free(backend_chains[seq_id]);
|
||||
}
|
||||
backend_chains.clear();
|
||||
|
||||
if (batch.token != nullptr) {
|
||||
free(batch.token);
|
||||
batch.token = nullptr;
|
||||
|
||||
@@ -1610,6 +1610,47 @@ class TextModel(ModelBase):
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_hybriddna(self):
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]
|
||||
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
|
||||
# k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
|
||||
# dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
|
||||
# k-mer's own id (llama.cpp strips it on detokenization)
|
||||
for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute]
|
||||
reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute]
|
||||
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
|
||||
|
||||
tokens: list[str] = []
|
||||
toktypes: list[int] = []
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
else:
|
||||
token: str = reverse_vocab[i]
|
||||
if token in added_vocab:
|
||||
if added_tokens_decoder[i].special or self.does_token_look_special(token):
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
tokens.append(token)
|
||||
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
self.gguf_writer.add_tokenizer_model("hybriddna")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_qwen(self):
|
||||
from .qwen import QwenModel
|
||||
|
||||
|
||||
+10
-60
@@ -189,7 +189,8 @@ class HunYuanModel(TextModel):
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
# HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
|
||||
# Some HunYuanVL variants (e.g. OCR-style configs) have pad_token_id=-1;
|
||||
# guard SpecialVocab so it doesn't try to emit an invalid pad id.
|
||||
token_types = None
|
||||
if (self.hparams.get("pad_token_id") or 0) < 0:
|
||||
token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
|
||||
@@ -250,7 +251,8 @@ class HunYuanModel(TextModel):
|
||||
self._fix_special_tokens()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
# HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
|
||||
# Some HunYuanVL variants set num_experts=1 (not real MoE);
|
||||
# prevent the parent class from emitting expert_count metadata in that case.
|
||||
saved_num_experts = self.hparams.pop("num_experts", None)
|
||||
super().set_gguf_parameters()
|
||||
if saved_num_experts is not None and saved_num_experts > 1:
|
||||
@@ -288,51 +290,21 @@ class HunYuanModel(TextModel):
|
||||
|
||||
@ModelBase.register("HunYuanVLForConditionalGeneration")
|
||||
class HunyuanVLVisionModel(MmprojModel):
|
||||
# Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
|
||||
# "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
|
||||
# Each variant maps to a different projector type in clip.cpp so image
|
||||
# preprocessing follows the correct code path.
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
# HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
|
||||
# HunyuanVL uses max_image_size instead of image_size
|
||||
if "image_size" not in self.hparams_vision:
|
||||
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
|
||||
|
||||
@staticmethod
|
||||
def is_ocr_variant(hparams: dict) -> bool:
|
||||
"""Return True for HunyuanOCR, False for HunyuanVL.
|
||||
|
||||
The projector's output dim must equal the text model's hidden_size by
|
||||
construction (that's what "projector" means). HunyuanOCR pairs a 1B text
|
||||
backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
|
||||
ViT -> LLM projection dim is a hard architectural signature, not a
|
||||
magic number.
|
||||
"""
|
||||
vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
|
||||
return vision_out == 1024
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
assert self.hparams_vision is not None
|
||||
vcfg = self.hparams_vision
|
||||
|
||||
if self.is_ocr_variant(self.global_config):
|
||||
# --- HunyuanOCR ---
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
|
||||
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
|
||||
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
|
||||
return
|
||||
|
||||
# --- HunyuanVL ---
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
|
||||
self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
|
||||
self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
|
||||
self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
|
||||
|
||||
@@ -353,7 +325,7 @@ class HunyuanVLVisionModel(MmprojModel):
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
|
||||
# Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
|
||||
# HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
|
||||
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
|
||||
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
@@ -361,40 +333,18 @@ class HunyuanVLVisionModel(MmprojModel):
|
||||
|
||||
@ModelBase.register("HunYuanVLForConditionalGeneration")
|
||||
class HunyuanVLTextModel(HunYuanModel):
|
||||
# The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
|
||||
# and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
|
||||
# while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
|
||||
# the config and pick the matching GGUF architecture.
|
||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
|
||||
|
||||
@staticmethod
|
||||
def _is_ocr_config(hparams: dict) -> bool:
|
||||
# OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
|
||||
# outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
|
||||
# HunyuanVLVisionModel.is_ocr_variant.
|
||||
return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
|
||||
|
||||
def __init__(self, dir_model: Path, *args, **kwargs):
|
||||
raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
|
||||
if self._is_ocr_config(raw_hparams):
|
||||
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
|
||||
else:
|
||||
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
|
||||
super().__init__(dir_model, *args, **kwargs)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
|
||||
# the HunYuan-Dense arch which already handles standard rope in super().
|
||||
if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
|
||||
return
|
||||
|
||||
# XD-RoPE metadata for the HunyuanVL;
|
||||
if self.rope_parameters.get("rope_type") != "xdrope":
|
||||
return
|
||||
|
||||
# defaults for HunyuanVL. The C++ side later computes:
|
||||
# freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
|
||||
self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
|
||||
self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
|
||||
+9
-7
@@ -51,6 +51,15 @@ class LlamaModel(TextModel):
|
||||
if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
|
||||
self._set_vocab_mistral()
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if (add_prefix_space := tokenizer_config_json.get("add_prefix_space")) is not None:
|
||||
self.gguf_writer.add_add_space_prefix(add_prefix_space)
|
||||
if tokenizer_config_json.get("tokenizer_class") == "HybridDNATokenizer":
|
||||
return self._set_vocab_hybriddna()
|
||||
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
@@ -72,13 +81,6 @@ class LlamaModel(TextModel):
|
||||
special_vocab._set_special_token("eot", 32010)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if "add_prefix_space" in tokenizer_config_json:
|
||||
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||
|
||||
# Apply to granite small models only
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
@@ -489,6 +489,7 @@ The following templates have active tests in `tests/test-chat.cpp`:
|
||||
| Qwen-QwQ-32B | Reasoning | Forced-open thinking |
|
||||
| NousResearch Hermes 2 Pro | JSON_NATIVE | `<tool_call>` wrapper |
|
||||
| IBM Granite 3.3 | JSON_NATIVE | `<think></think>` + `<response></response>` |
|
||||
| IBM Granite 4.0 | JSON_NATIVE | `<tool_call>` wrapper (same template used by 4.1) |
|
||||
| ByteDance Seed-OSS | TAG_WITH_TAGGED | Custom `<seed:think>` and `<seed:tool_call>` tags |
|
||||
| Qwen3-Coder | TAG_WITH_TAGGED | XML-style tool format |
|
||||
| DeepSeek V3.1 | JSON_NATIVE | Forced thinking mode |
|
||||
|
||||
@@ -33,8 +33,8 @@
|
||||
"name": "arm64-windows-snapdragon",
|
||||
"inherits": [ "base", "arm64-windows-llvm" ],
|
||||
"cacheVariables": {
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
|
||||
@@ -24,7 +24,7 @@ Native Windows 11 arm64 builds has the following tools dependencies:
|
||||
- UCRT and Driver Kit
|
||||
- LLVM core libraries and Clang compiler (winget)
|
||||
- CMake, Git, Python (winget)
|
||||
- Hexagon SDK Community Edition 6.4 or later (see windows.md)
|
||||
- Hexagon SDK Community Edition 6.6 or later (see windows.md)
|
||||
- OpenCL SDK 2.3 or later (see windows.md)
|
||||
|
||||
Note: The rest of the **Windows** build process assumes that you're running natively in Powershell.
|
||||
@@ -45,7 +45,7 @@ Preset CMake variables:
|
||||
GGML_HEXAGON="ON"
|
||||
GGML_OPENCL="ON"
|
||||
GGML_OPENMP="OFF"
|
||||
HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2"
|
||||
HEXAGON_SDK_ROOT="/opt/hexagon/6.6.0.0"
|
||||
...
|
||||
-- Including OpenCL backend
|
||||
-- Including Hexagon backend
|
||||
|
||||
@@ -28,15 +28,15 @@ c:\Qualcomm\OpenCL_SDK\2.3.2
|
||||
|
||||
Either use the trimmed down version (optimized for CI) from
|
||||
|
||||
https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz
|
||||
https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.6.0.0/hexagon-sdk-v6.6.0.0-arm64-wos.tar.xz
|
||||
|
||||
Or download the complete official version from
|
||||
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.6.0.0
|
||||
|
||||
Unzip/untar the archive into
|
||||
```
|
||||
c:\Qualcomm\Hexagon_SDK\6.4.0.2
|
||||
c:\Qualcomm\Hexagon_SDK\6.6.0.0
|
||||
```
|
||||
|
||||
## Install the latest Adreno GPU driver
|
||||
@@ -123,10 +123,10 @@ The overall Hexagon backend build procedure for Windows on Snapdragon is the sam
|
||||
However, additional settings are required for generating and signing HTP Ops libraries.
|
||||
```
|
||||
> $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2"
|
||||
> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2"
|
||||
> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04"
|
||||
> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0"
|
||||
> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0\tools\HEXAGON_Tools\19.0.07"
|
||||
> $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx"
|
||||
> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
|
||||
> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0"
|
||||
|
||||
> cmake --preset arm64-windows-snapdragon-release -B build-wos
|
||||
...
|
||||
|
||||
+1
-1
@@ -735,7 +735,7 @@ ninja
|
||||
|
||||
To read documentation for how to build on Android, [click here](./android.md)
|
||||
|
||||
## WebGPU [In Progress]
|
||||
## WebGPU
|
||||
|
||||
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `18eb229`.
|
||||
|
||||
|
||||
@@ -291,6 +291,7 @@ Here are some models known to work (w/ chat template override when needed):
|
||||
llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
|
||||
llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
|
||||
llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
|
||||
llama-server --jinja -fa -hf ibm-granite/granite-4.1-3b-GGUF:Q4_K_M
|
||||
|
||||
# Native support for DeepSeek R1 works best w/ our template override (official template is buggy, although we do work around it)
|
||||
|
||||
|
||||
+1
-1
@@ -247,7 +247,7 @@ Specifies a comma-separated list of speculative decoding types to use.
|
||||
|------|-------------|
|
||||
| `none` | No speculative decoding (default) |
|
||||
| `draft-simple` | Use a simple draft model for speculation |
|
||||
| `draft-mtp` | Use Masked Token Prediction (MTP) heads from the main model |
|
||||
| `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model |
|
||||
| `ngram-cache` | Use n-gram cache lookup |
|
||||
| `ngram-simple` | Use simple n-gram pattern matching |
|
||||
| `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
|
||||
|
||||
@@ -27,7 +27,6 @@ else()
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(passkey)
|
||||
add_subdirectory(retrieval)
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(simple-chat)
|
||||
add_subdirectory(speculative)
|
||||
|
||||
@@ -1308,7 +1308,8 @@ def do_dump_model(model_plus: ModelPlus) -> None:
|
||||
|
||||
def main(args_in: list[str] | None = None) -> None:
|
||||
output_choices = ["f32", "f16"]
|
||||
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
|
||||
dummy_val = np.uint32(1)
|
||||
if dummy_val == dummy_val.view(dummy_val.dtype.newbyteorder("<")):
|
||||
# We currently only support Q8_0 output on little endian systems.
|
||||
output_choices.append("q8_0")
|
||||
parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
|
||||
|
||||
@@ -25,6 +25,7 @@ android {
|
||||
arguments += "-DCMAKE_VERBOSE_MAKEFILE=ON"
|
||||
|
||||
arguments += "-DBUILD_SHARED_LIBS=ON"
|
||||
arguments += "-DLLAMA_BUILD_APP=OFF"
|
||||
arguments += "-DLLAMA_BUILD_COMMON=ON"
|
||||
arguments += "-DLLAMA_OPENSSL=OFF"
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
|
||||
print("Using SentenceTransformer to apply all numbered layers")
|
||||
model = SentenceTransformer(model_path)
|
||||
tokenizer = model.tokenizer
|
||||
config = model[0].auto_model.config
|
||||
config = model[0].auto_model.config # ty: ignore[unresolved-attribute]
|
||||
else:
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -76,6 +76,7 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
|
||||
// Utils
|
||||
// Create a buffer and allocate all the tensors in a ggml_context
|
||||
// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
|
||||
// ggml_backend_alloc_ctx_tensors_from_buft returns NULL on failure or if all tensors in ctx are already allocated or zero-sized
|
||||
GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
|
||||
|
||||
@@ -1275,6 +1275,9 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
|
||||
if (chunk_size_j == 0) {
|
||||
continue;
|
||||
}
|
||||
const size_t simple_offset = i_start * chunk_size_j;
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
|
||||
offset_j += chunk_size_j;
|
||||
@@ -1382,6 +1385,9 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
||||
for (size_t j = 0; j < n_bufs; j++){
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
|
||||
if (chunk_size_j == 0) {
|
||||
continue;
|
||||
}
|
||||
const size_t simple_offset = i_start * chunk_size_j;
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
|
||||
offset_j += chunk_size_j;
|
||||
@@ -1445,6 +1451,7 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac
|
||||
buf_ctx->buf_configs.reserve(n_simple_bufts);
|
||||
for (size_t i = 0; i < n_simple_bufts; i++) {
|
||||
ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size);
|
||||
GGML_ASSERT(simple_buf != nullptr);
|
||||
max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf));
|
||||
buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf);
|
||||
}
|
||||
@@ -1474,8 +1481,27 @@ struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struc
|
||||
t->data = (void *) 0x2000000000000000; // FIXME
|
||||
}
|
||||
for (size_t i = 0; i < n_simple_bufts; i++) {
|
||||
meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(
|
||||
meta_buf_ctx->buf_configs[i].ctx, ggml_backend_meta_buft_simple_buft(buft, i));
|
||||
ggml_context * ctx = meta_buf_ctx->buf_configs[i].ctx;
|
||||
ggml_backend_buffer_type_t simple_buft = ggml_backend_meta_buft_simple_buft(buft, i);
|
||||
|
||||
// If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL.
|
||||
// For those edge cases, allocate a dummy buffer instead.
|
||||
bool any_nonzero_slice = false;
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (ggml_nelements(t) != 0) {
|
||||
any_nonzero_slice = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (any_nonzero_slice) {
|
||||
meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft);
|
||||
} else {
|
||||
meta_buf_ctx->buf_configs[i].buf = ggml_backend_buft_alloc_buffer(simple_buft, 0);
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
t->buffer = meta_buf_ctx->buf_configs[i].buf;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(meta_buf_ctx->buf_configs[i].buf != nullptr);
|
||||
meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf));
|
||||
}
|
||||
return meta_buf;
|
||||
@@ -1605,6 +1631,9 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
|
||||
ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
|
||||
if (chunk_size_j == 0) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_tensor_set_2d_async(simple_backend, simple_tensor, (const char *) data + offset_j, offset, chunk_size_j,
|
||||
i_stop - i_start, chunk_size_j, chunk_size_full);
|
||||
offset_j += chunk_size_j;
|
||||
@@ -1646,6 +1675,9 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm
|
||||
ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
|
||||
if (chunk_size_j == 0) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_tensor_get_2d_async(simple_backend, simple_tensor, (char *) data + offset_j, offset, chunk_size_j,
|
||||
i_stop - i_start, chunk_size_j, chunk_size_full);
|
||||
offset_j += chunk_size_j;
|
||||
|
||||
@@ -306,7 +306,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_
|
||||
GGML_ASSERT(tensor);
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
|
||||
if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) {
|
||||
if (n_copies <= 1 || backend->iface.get_tensor_2d_async == NULL) {
|
||||
for (size_t i = 0; i < n_copies; i++) {
|
||||
ggml_backend_tensor_get_async(backend, tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
|
||||
}
|
||||
@@ -317,7 +317,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_
|
||||
}
|
||||
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||
backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
|
||||
}
|
||||
|
||||
@@ -379,7 +379,7 @@ void ggml_backend_tensor_get_2d(const struct ggml_tensor * tensor, void * data,
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
||||
|
||||
if (n_copies <= 1 || buf->iface.set_tensor_2d == NULL) {
|
||||
if (n_copies <= 1 || buf->iface.get_tensor_2d == NULL) {
|
||||
for (size_t i = 0; i < n_copies; i++) {
|
||||
ggml_backend_tensor_get(tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
|
||||
}
|
||||
|
||||
@@ -1561,7 +1561,8 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke
|
||||
return env == nullptr || std::atoi(env) != 0;
|
||||
}();
|
||||
|
||||
if (env_pdl_enabled && ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_HOPPER) {
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) {
|
||||
auto pdl_cfg = ggml_cuda_pdl_config(launch_params);
|
||||
|
||||
CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward<Args>(args)... ));
|
||||
|
||||
@@ -2735,9 +2735,10 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session *
|
||||
if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: add support for non-contiguous tensors
|
||||
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
|
||||
if (src0->nb[0] != sizeof(float) || src1->nb[0] != sizeof(float) || dst->nb[0] != sizeof(float)) {
|
||||
return false;
|
||||
}
|
||||
if (src0->nb[1] != src0->ne[0] * sizeof(float) || src1->nb[1] != src1->ne[0] * sizeof(float)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -852,9 +852,10 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) {
|
||||
v_s_rowmax1 = hvx_vec_reduce_max_f16(v_s_rowmax1);
|
||||
|
||||
// Splat m_prev[r], m_prev[r+1] from the per-row accumulator.
|
||||
// vror brings the target lane to lane 0, then extract + re-splat.
|
||||
HVX_Vector v_m_prev0 = hvx_vec_splat_f16(hvx_vec_get_f16(Q6_V_vror_VR(m_prev_v, r_vec_off * 2)));
|
||||
HVX_Vector v_m_prev1 = hvx_vec_splat_f16(hvx_vec_get_f16(Q6_V_vror_VR(m_prev_v, (r_vec_off + 1) * 2)));
|
||||
// vror brings the target lane to lane 0, then vdelta replicates it
|
||||
// across all lanes — stays in the vector domain (no store/reload).
|
||||
HVX_Vector v_m_prev0 = hvx_vec_repl_f16(Q6_V_vror_VR(m_prev_v, r_vec_off * 2));
|
||||
HVX_Vector v_m_prev1 = hvx_vec_repl_f16(Q6_V_vror_VR(m_prev_v, (r_vec_off + 1) * 2));
|
||||
|
||||
// HVX max — both operands are splats, so result is splat of m_new.
|
||||
HVX_Vector v_dup_m0 = Q6_Vhf_vmax_VhfVhf(v_m_prev0, v_s_rowmax0);
|
||||
|
||||
@@ -201,11 +201,10 @@ static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(const uint8_t *packed_32
|
||||
|
||||
// Batch-dequantize 4 contiguous x4x2 Q4_0 groups (4x32 = 128 packed bytes) using
|
||||
// full HVX vector width. One vmemu + one vlut16 replaces 4 separate calls.
|
||||
// Output: out[0..3] each hold 32 FP16 values in the first 64 bytes.
|
||||
static inline void dequantize_x4x2_q4_0_x4groups_hvx(
|
||||
// Output: vector_x2 each hold 32 FP16 values in the first 64 bytes.
|
||||
static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx(
|
||||
const uint8_t *packed_128, bool upper_nibbles,
|
||||
const __fp16 *scales_4, const HVX_Vector vlut_cvt,
|
||||
HVX_Vector out[4]) {
|
||||
const __fp16 *scales_4, const HVX_Vector vlut_cvt) {
|
||||
// Load all 128 packed bytes (4 contiguous 32-byte groups)
|
||||
HVX_Vector vq = hvx_vmemu(packed_128);
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
@@ -221,8 +220,7 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
|
||||
HVX_Vector v_hi = Q6_V_hi_W(vp); // [group2: 32 fp16 | group3: 32 fp16]
|
||||
|
||||
// Build per-group scale vectors: first 64 bytes use scale_a, last 64 use scale_b
|
||||
volatile HVX_Vector vscale = hvx_vmemu(scales_4);
|
||||
|
||||
HVX_Vector vscale = hvx_vmemu(scales_4);
|
||||
HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);
|
||||
HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));
|
||||
|
||||
@@ -230,8 +228,9 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
|
||||
v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
|
||||
|
||||
// Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter
|
||||
out[0] = v_lo; // group0 already in [0:63]
|
||||
out[1] = v_hi; // group2 already in [0:63]
|
||||
HVX_Vector_x2 r = { v_lo,/* group1 already in [0:63] */
|
||||
v_hi /* group2 already in [0:63] */ };
|
||||
return r;
|
||||
}
|
||||
|
||||
// Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes.
|
||||
@@ -292,12 +291,11 @@ static inline HVX_Vector dequantize_x4x2_mxfp4_group_hvx(const uint8_t * packed
|
||||
}
|
||||
|
||||
// Batch-dequantize 4 contiguous x4x2 MXFP4 groups (4x32 = 128 packed bytes).
|
||||
static inline void dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t * packed_128,
|
||||
static inline HVX_Vector_x4 dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t * packed_128,
|
||||
bool upper_nibbles,
|
||||
int sub_blk_base,
|
||||
const HVX_Vector vlut_cvt,
|
||||
mxfp4_scales_t scales,
|
||||
HVX_Vector out[4]) {
|
||||
mxfp4_scales_t scales) {
|
||||
HVX_Vector vq = hvx_vmemu(packed_128);
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
|
||||
@@ -318,10 +316,8 @@ static inline void dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t * packed_12
|
||||
v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
|
||||
v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
|
||||
|
||||
out[0] = v_lo;
|
||||
out[1] = Q6_V_vror_VR(v_lo, 64);
|
||||
out[2] = v_hi;
|
||||
out[3] = Q6_V_vror_VR(v_hi, 64);
|
||||
HVX_Vector_x4 r = { v_lo, Q6_V_vror_VR(v_lo, 64), v_hi, Q6_V_vror_VR(v_hi, 64) };
|
||||
return r;
|
||||
}
|
||||
|
||||
// Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16.
|
||||
@@ -372,18 +368,18 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
|
||||
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
|
||||
HVX_Vector v0[2];
|
||||
const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
|
||||
dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
|
||||
const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
|
||||
|
||||
HVX_Vector_x2 dv0 = dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
|
||||
HVX_Vector_x2 dv1 = dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);
|
||||
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
|
||||
|
||||
r0 = vtcm_src + row_offset; row_offset += row_stride;
|
||||
dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
}
|
||||
|
||||
@@ -415,21 +411,21 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
// Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
|
||||
mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
|
||||
|
||||
HVX_Vector v0[4], v1[4];
|
||||
dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8, v0);
|
||||
HVX_Vector_x4 dv0, dv1;
|
||||
dv0 = dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8);
|
||||
if (row1 < n_cols) {
|
||||
mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
|
||||
dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8, v1);
|
||||
dv1 = dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8);
|
||||
} else {
|
||||
v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero();
|
||||
dv1.v[0] = dv1.v[1] = dv1.v[2] = dv1.v[3] = Q6_V_vzero();
|
||||
}
|
||||
|
||||
for (int g = 0; g < 4; g++) {
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]);
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[g]);
|
||||
}
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
for (int g = 0; g < 4; g++) {
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]);
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[g]);
|
||||
}
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
}
|
||||
@@ -612,11 +608,13 @@ static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict
|
||||
const __fp16 *row_tiles = activation + r * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
|
||||
const __fp16 *col_tiles = weight + c * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
|
||||
|
||||
for (int k = 0; k < n_dot_tiles; ++k) {
|
||||
Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
|
||||
Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
|
||||
row_tiles += HMX_FP16_TILE_N_ELMS;
|
||||
col_tiles += HMX_FP16_TILE_N_ELMS;
|
||||
for (int k = 0, k_block; k < n_dot_tiles; k += k_block) {
|
||||
k_block = hex_smin(n_dot_tiles - k, 32);
|
||||
const uint32_t range = 2048u * (uint32_t)k_block - 1;
|
||||
Q6_activation_hf_mxmem_RR_deep((unsigned int)row_tiles, range);
|
||||
Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, range);
|
||||
row_tiles += k_block * HMX_FP16_TILE_N_ELMS;
|
||||
col_tiles += k_block * HMX_FP16_TILE_N_ELMS;
|
||||
}
|
||||
|
||||
__fp16 *out_tile = output + (r * n_col_tiles + c) * HMX_FP16_TILE_N_ELMS;
|
||||
@@ -832,10 +830,6 @@ static void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *
|
||||
worker_pool_run_func(ctx->worker_pool, transfer_activation_chunk_worker_fn, &state, ctx->n_threads);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
#define FALLBACK_TO_STANDARD 1
|
||||
|
||||
// C += AB
|
||||
static void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __fp16 *restrict b,
|
||||
const __fp16 *restrict col_scales, const __fp16 *restrict eye_tile,
|
||||
@@ -861,314 +855,80 @@ static void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, co
|
||||
Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, 2047);
|
||||
}
|
||||
|
||||
for (int k = 0; k < n_dot_tiles; ++k) {
|
||||
Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
|
||||
Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
|
||||
row_tiles += HMX_FP16_TILE_N_ELMS;
|
||||
col_tiles += HMX_FP16_TILE_N_ELMS;
|
||||
for (int k = 0, k_block; k < n_dot_tiles; k += k_block) {
|
||||
k_block = hex_smin(n_dot_tiles - k, 32);
|
||||
const uint32_t range = 2048u * (uint32_t)k_block - 1;
|
||||
Q6_activation_hf_mxmem_RR_deep((unsigned int)row_tiles, range);
|
||||
Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, range);
|
||||
row_tiles += k_block * HMX_FP16_TILE_N_ELMS;
|
||||
col_tiles += k_block * HMX_FP16_TILE_N_ELMS;
|
||||
}
|
||||
|
||||
Q6_mxmem_AR_after_hf(accum_tile, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static __attribute__((noinline)) int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx,
|
||||
float *restrict out, const float *restrict x, const uint8_t *restrict w,
|
||||
int m, int k, int n, int weight_type) {
|
||||
// assume k % 32 == 0 && n % 32 == 0
|
||||
const size_t row_stride = get_x4x2_row_stride(weight_type, k);
|
||||
if (row_stride == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
const size_t vtcm_budget = ctx->vtcm_size;
|
||||
|
||||
const size_t K_BLOCK_SIZE = 1024;
|
||||
|
||||
// Fallback: if k doesn't need K-blocking, out-stationary has no advantage
|
||||
const size_t k_iters_check = (k + K_BLOCK_SIZE - 1) / K_BLOCK_SIZE;
|
||||
if (k_iters_check <= 1) {
|
||||
FARF(HIGH, "%s: K_BLK=%zu >= k=%d, fallback to standard path", __func__, K_BLOCK_SIZE, k);
|
||||
return FALLBACK_TO_STANDARD;
|
||||
}
|
||||
|
||||
// Dynamic M,N search via hmx_compute_chunks
|
||||
const size_t sub_row_stride_alloc = get_x4x2_row_stride(weight_type, K_BLOCK_SIZE);
|
||||
const size_t per_m = K_BLOCK_SIZE * sizeof(float) // scratch1: M×K×4 (act DMA staging F32)
|
||||
+ K_BLOCK_SIZE * sizeof(__fp16); // activation: M×K×2 (F16 tiles)
|
||||
const size_t per_n = sub_row_stride_alloc // scratch0: N×sub_row(K) (packed quant)
|
||||
+ K_BLOCK_SIZE * sizeof(__fp16); // weight: N×K×2 (F16 tiles)
|
||||
const size_t per_mn = sizeof(__fp16); // output: M×N×2 (out-stationary)
|
||||
|
||||
// Alignment margin: hex_align_up can add up to 2047 bytes per buffer;
|
||||
// scratch1 (mc×6144) is naturally 2048-aligned, remaining 4 buffers need margin
|
||||
const size_t align_margin = 4 * HMX_FP16_TILE_SIZE;
|
||||
const size_t overhead = HMX_FP16_TILE_SIZE + 256 + align_margin; // eye_tile + scales + alignment
|
||||
|
||||
size_t M_BLOCK_SIZE, N_BLOCK_SIZE, vtcm_used;
|
||||
// Cost-based search: minimize ceil(m/mc)*m_block_cost + ceil(n/nc)*n_block_cost.
|
||||
// From profiling: wt_dequant per element ≈ 1.5× activation load per element.
|
||||
// m_block_cost = n*3: each extra M-block re-dequants all N×K weight (expensive).
|
||||
// n_block_cost = m*2: each extra N-block re-loads all M×K activation (cheaper).
|
||||
const size_t m_block_cost = (size_t) n * 3;
|
||||
const size_t n_block_cost = (size_t) m * 2;
|
||||
if (hmx_compute_chunks(vtcm_budget, overhead, per_n, per_m, per_mn,
|
||||
hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
|
||||
m_block_cost, n_block_cost, &M_BLOCK_SIZE,
|
||||
&N_BLOCK_SIZE, &vtcm_used) != 0) {
|
||||
FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Compute precise buffer sizes from searched M,N and fixed K
|
||||
const size_t weight_size = hex_align_up(N_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
|
||||
const size_t act_size = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
|
||||
const size_t out_size = hex_align_up(M_BLOCK_SIZE * N_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
|
||||
const size_t scratch0_sz = hex_align_up(N_BLOCK_SIZE * sub_row_stride_alloc, HMX_FP16_TILE_SIZE);
|
||||
const size_t scratch1_sz = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(float), HMX_FP16_TILE_SIZE);
|
||||
|
||||
const size_t total_vtcm = weight_size + act_size + out_size + scratch0_sz + scratch1_sz + HMX_FP16_TILE_SIZE + 256;
|
||||
if (total_vtcm > vtcm_budget) {
|
||||
FARF(HIGH, "%s: VTCM overflow after search: need %zu have %zu (M=%zu N=%zu K=%zu)", __func__, total_vtcm,
|
||||
vtcm_budget, M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE);
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint8_t *vtcm_ptr = (uint8_t *) ctx->vtcm_base;
|
||||
__fp16 *vtcm_weight = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_size);
|
||||
__fp16 *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_size);
|
||||
__fp16 *vtcm_output = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, out_size);
|
||||
uint8_t *vtcm_scratch0 = vtcm_seq_alloc(&vtcm_ptr, scratch0_sz);
|
||||
uint8_t *vtcm_scratch1 = vtcm_seq_alloc(&vtcm_ptr, scratch1_sz);
|
||||
__fp16 *vtcm_eye_tile = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, HMX_FP16_TILE_SIZE);
|
||||
__fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
|
||||
assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget);
|
||||
|
||||
FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", m, k, n, weight_type,
|
||||
M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);
|
||||
|
||||
// initialize eye tile (32x32 identity matrix)
|
||||
{
|
||||
HVX_Vector v;
|
||||
v = Q6_V_vzero();
|
||||
v = Q6_Vw_vinsert_VwR(v, 0x3c000000);
|
||||
v = Q6_V_vror_VR(v, VLEN - 4);
|
||||
v = Q6_Vw_vinsert_VwR(v, 0x00003c00);
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
((HVX_Vector *) vtcm_eye_tile)[i] = v;
|
||||
v = Q6_V_vror_VR(v, VLEN - 8);
|
||||
}
|
||||
}
|
||||
hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16
|
||||
|
||||
TIMER_DEFINE(fetch);
|
||||
TIMER_DEFINE(act_load);
|
||||
TIMER_DEFINE(wt_dequant);
|
||||
TIMER_DEFINE(core);
|
||||
|
||||
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
|
||||
|
||||
for (size_t mr = 0; mr < m; mr += M_BLOCK_SIZE) {
|
||||
size_t m_blk_sz = hex_smin(m - mr, M_BLOCK_SIZE);
|
||||
for (size_t nc = 0; nc < n; nc += N_BLOCK_SIZE) {
|
||||
size_t n_blk_sz = hex_smin(n - nc, N_BLOCK_SIZE);
|
||||
|
||||
const int n_row_tiles = hmx_ceil_div(m_blk_sz, HMX_FP16_TILE_N_ROWS);
|
||||
const int n_col_tiles = hmx_ceil_div(n_blk_sz, HMX_FP16_TILE_N_COLS);
|
||||
|
||||
for (size_t kk = 0; kk < k; kk += K_BLOCK_SIZE) {
|
||||
const size_t k_blk_sz = hex_smin(k - kk, K_BLOCK_SIZE);
|
||||
|
||||
TIMER_START(fetch);
|
||||
// fetch activation block into VTCM
|
||||
{
|
||||
const float *activation_block = x + mr * k + kk;
|
||||
|
||||
dma_queue_push(ctx->dma[0],
|
||||
dma_make_ptr(vtcm_scratch1, activation_block),
|
||||
k_blk_sz * sizeof(float),
|
||||
k * sizeof(float),
|
||||
k_blk_sz * sizeof(float),
|
||||
m_blk_sz);
|
||||
}
|
||||
|
||||
// fetch weight block into VTCM (x4x2 sub-block: quants + scales)
|
||||
const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz);
|
||||
{
|
||||
const int blk_start = kk / QK_Q4_0x4x2;
|
||||
const int nb_sub = (k_blk_sz + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2;
|
||||
const int full_qrow = (weight_type == HTP_TYPE_Q8_0) ? k : (k / 2);
|
||||
const int scale_blk_size = (weight_type == HTP_TYPE_MXFP4) ? HMX_X4X2_MXFP4_EBLK_SIZE : HMX_X4X2_DBLK_SIZE;
|
||||
uint8_t *dst = vtcm_scratch0;
|
||||
const uint8_t *src = w + nc * row_stride;
|
||||
const size_t n_rows = n_blk_sz;
|
||||
const size_t src_stride = row_stride;
|
||||
const size_t dst_stride = sub_row_stride;
|
||||
const size_t quant_off = (weight_type == HTP_TYPE_Q8_0) ? (blk_start * QK_Q8_0x4x2) : (blk_start * (QK_Q4_0x4x2 / 2));
|
||||
const size_t quant_width = (weight_type == HTP_TYPE_Q8_0) ? (nb_sub * QK_Q8_0x4x2) : (nb_sub * (QK_Q4_0x4x2 / 2));
|
||||
const size_t scale_off = full_qrow + blk_start * scale_blk_size;
|
||||
const size_t scale_width = nb_sub * scale_blk_size;
|
||||
|
||||
// 2D DMA: quants sub-range
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(dst, src + quant_off), dst_stride, src_stride, quant_width, n_rows);
|
||||
// 2D DMA: scales sub-range
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(dst + quant_width, src + scale_off), dst_stride, src_stride, scale_width, n_rows);
|
||||
}
|
||||
TIMER_STOP(fetch);
|
||||
|
||||
TIMER_START(act_load);
|
||||
// load activation block
|
||||
{
|
||||
dma_queue_pop(ctx->dma[0]); // wait for act DNA
|
||||
transfer_activation_chunk_threaded(ctx, vtcm_activation, (float *) vtcm_scratch1, m_blk_sz, k_blk_sz, k_blk_sz);
|
||||
}
|
||||
TIMER_STOP(act_load);
|
||||
|
||||
TIMER_START(wt_dequant);
|
||||
// dequantize weight block
|
||||
{
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
// vtcm_scratch0 is used to store the qweight chunk
|
||||
// worker_pool_run_func already returned, so fetch is done
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight, vtcm_scratch0,
|
||||
n_blk_sz, k_blk_sz, sub_row_stride, weight_type);
|
||||
}
|
||||
TIMER_STOP(wt_dequant);
|
||||
|
||||
// core mma
|
||||
TIMER_START(core);
|
||||
{
|
||||
core_mma_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, vtcm_eye_tile, n_row_tiles,
|
||||
n_col_tiles, k_blk_sz / HMX_FP16_TILE_N_COLS, kk == 0);
|
||||
}
|
||||
TIMER_STOP(core);
|
||||
}
|
||||
|
||||
// store output block
|
||||
{
|
||||
float *output_block = out + (mr * n + nc);
|
||||
transfer_output_chunk_threaded(ctx, output_block, vtcm_output, m_blk_sz, n_blk_sz, n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
|
||||
|
||||
#if defined(ENABLE_PROFILE_TIMERS)
|
||||
FARF(HIGH, "fetch: %lld us, act_load: %lld us, wt_dequant: %lld us, core: %lld us",
|
||||
TIMER_US(fetch), TIMER_US(act_load), TIMER_US(wt_dequant), TIMER_US(core));
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
|
||||
int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
|
||||
const uint8_t *restrict permuted_weight, int m, int k, int n,
|
||||
int weight_type) {
|
||||
if (!dst || !activation || !permuted_weight || !m || !n || !k) { return -1; }
|
||||
if (k % 32 != 0 || n % 32 != 0) { return -1; }
|
||||
|
||||
if (!hex_is_aligned(dst, VLEN) || !hex_is_aligned(activation, VLEN) || !hex_is_aligned(permuted_weight, VLEN)) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// for large m, k (e.g. prefill FFN Down), use out-stationary version
|
||||
if (m >= 128 && k > n && n > 1024) {
|
||||
int rc = mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
|
||||
if (rc != FALLBACK_TO_STANDARD) {
|
||||
return rc; // 0 success, -1 error
|
||||
}
|
||||
FARF(HIGH, "hmx_matmul_qk: out-stationary fallback to standard m=%d k=%d n=%d", m, k, n);
|
||||
// fall through to standard path
|
||||
}
|
||||
|
||||
size_t row_stride = get_x4x2_row_stride(weight_type, k);
|
||||
if (row_stride == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
FARF(HIGH, "hmx_matmul_qk: STANDARD path m=%d k=%d n=%d type=%d", m, k, n, weight_type);
|
||||
|
||||
// --- Dynamic VTCM layout ---
|
||||
const size_t vtcm_budget = ctx->vtcm_size;
|
||||
const size_t vec_dot_size = k * sizeof(__fp16);
|
||||
const size_t vec_dot_size = k * sizeof(__fp16);
|
||||
const size_t vtcm_budget = ctx->vtcm_size;
|
||||
size_t vtcm_used = 0;
|
||||
|
||||
// Pipeline = 4-stage DMA→dequant→HMX→store with HMX worker overlap.
|
||||
// Only pays off when the chunker yields >=2 n-chunks, so the main loop can
|
||||
// overlap HMX (C) with HVX (B/D); with a single n-chunk the extra VTCM for
|
||||
// double-buffered output and the worker-dispatch overhead are pure loss.
|
||||
// Try pipeline costs first; fall back to sequential if the layout collapses
|
||||
// to one n-chunk. m >= 128 floor keeps HMX utilization reasonable.
|
||||
const size_t pipe_per_n = row_stride + 2 * vec_dot_size; // Q + S0 + S1 (dequant bufs)
|
||||
const size_t pipe_per_mn = 2 * sizeof(__fp16); // O x 2 (output double buffer)
|
||||
const size_t seq_per_n = vec_dot_size + 2 * row_stride; // W + S0 + S1 (x4x2 DMA bufs)
|
||||
const size_t seq_per_mn = sizeof(__fp16); // O x 1
|
||||
const size_t size_per_n = row_stride + 2 * vec_dot_size; // Q + S0 + S1 (dequant bufs)
|
||||
const size_t size_per_mn = 2 * sizeof(__fp16); // O x 2 (output double buffer)
|
||||
|
||||
size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
|
||||
bool use_pipeline = false;
|
||||
|
||||
if (m >= 128) {
|
||||
size_t mc = 0, nc = 0, used = 0;
|
||||
if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, pipe_per_n, /*per_m=*/vec_dot_size, pipe_per_mn,
|
||||
hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
|
||||
/*m_block_cost=*/(size_t) n * 3,
|
||||
/*n_block_cost=*/(size_t) m * 2, &mc, &nc, &used) == 0 &&
|
||||
hmx_ceil_div((size_t) n, nc) >= 2) {
|
||||
m_chunk_n_rows = mc;
|
||||
n_chunk_n_cols = nc;
|
||||
vtcm_used = used;
|
||||
use_pipeline = true;
|
||||
}
|
||||
size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0;
|
||||
if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, size_per_n, /*per_m=*/vec_dot_size, size_per_mn,
|
||||
hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
|
||||
/*m_block_cost=*/(size_t) n * 3,
|
||||
/*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used)) {
|
||||
FARF(HIGH, "hmx-mm-q: VTCM too small : m %d k %d n %d budget %zu", m, k, n, vtcm_budget);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!use_pipeline) {
|
||||
if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, seq_per_n, /*per_m=*/vec_dot_size, seq_per_mn,
|
||||
hex_align_up(m, HMX_FP16_TILE_N_ROWS), n,
|
||||
/*m_block_cost=*/(size_t) n * 3,
|
||||
/*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
|
||||
FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Compute precise buffer sizes per execution path
|
||||
const size_t weight_area_size = hex_align_up(
|
||||
n_chunk_n_cols * (use_pipeline ? row_stride : vec_dot_size), HMX_FP16_TILE_SIZE);
|
||||
const size_t activation_area_size = hex_align_up(m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE);
|
||||
const size_t output_area_size = hex_align_up(
|
||||
m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE);
|
||||
const size_t weight_area_size = hex_align_up(n_chunk_n_cols * row_stride, HMX_FP16_TILE_SIZE);
|
||||
const size_t act_area_size = hex_align_up(m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE);
|
||||
const size_t output_area_size = hex_align_up(m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE);
|
||||
|
||||
size_t scratch0_size, scratch1_size, scratch2_size;
|
||||
if (use_pipeline) {
|
||||
scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE); // dequant buf 0
|
||||
scratch1_size = scratch0_size; // dequant buf 1
|
||||
scratch2_size = output_area_size; // output buf 1
|
||||
} else {
|
||||
scratch0_size = hex_align_up(n_chunk_n_cols * row_stride, HMX_FP16_TILE_SIZE); // x4x2 DMA buf 0
|
||||
scratch1_size = scratch0_size; // x4x2 DMA buf 1
|
||||
scratch2_size = 0; // unused
|
||||
}
|
||||
scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE); // dequant buf 0
|
||||
scratch1_size = scratch0_size; // dequant buf 1
|
||||
scratch2_size = output_area_size; // output buf 1
|
||||
|
||||
uint8_t *vtcm_ptr = (uint8_t *) ctx->vtcm_base;
|
||||
__fp16 *vtcm_weight = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size);
|
||||
__fp16 *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, activation_area_size);
|
||||
__fp16 *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_area_size);
|
||||
__fp16 *vtcm_output = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size);
|
||||
void *vtcm_scratch0 = vtcm_seq_alloc(&vtcm_ptr, scratch0_size);
|
||||
void *vtcm_scratch1 = vtcm_seq_alloc(&vtcm_ptr, scratch1_size);
|
||||
void *vtcm_scratch2 = scratch2_size ? vtcm_seq_alloc(&vtcm_ptr, scratch2_size) : NULL;
|
||||
__fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
|
||||
if ((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) > vtcm_budget) {
|
||||
FARF(ERROR, "%s: vtcm overflow: used=%zu limit=%zu", __func__,
|
||||
(size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
|
||||
|
||||
vtcm_used = vtcm_ptr - (uint8_t *) ctx->vtcm_base;
|
||||
if (vtcm_used > vtcm_budget) {
|
||||
FARF(ERROR, "hmx-mm-q: VTCM overflow: used %zu budget %zu", vtcm_used, vtcm_budget);
|
||||
return -1;
|
||||
}
|
||||
|
||||
hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16
|
||||
|
||||
FARF(HIGH, "%s: m=%d k=%d n=%d wtype=%d pipe=%d mc=%zu nc=%zu vtcm=%zu/%zu",
|
||||
__func__, m, k, n, weight_type, use_pipeline,
|
||||
m_chunk_n_rows, n_chunk_n_cols,
|
||||
(size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
|
||||
FARF(HIGH, "hmx-mm-q: standard : m %d k %d n %d wtype %d mc %zu nc %zu vtcm %zu/%zu",
|
||||
m, k, n, weight_type, m_chunk_n_rows, n_chunk_n_cols, vtcm_used, vtcm_budget);
|
||||
|
||||
TIMER_DEFINE(activation_load);
|
||||
TIMER_DEFINE(weight_load);
|
||||
@@ -1178,184 +938,115 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
TIMER_DEFINE(total);
|
||||
TIMER_START(total);
|
||||
|
||||
FARF(HIGH, "hmx_matmul_qk: %s mc=%zu nc=%zu vtcm=%zu/%zu",
|
||||
use_pipeline ? "PIPELINE" : "SEQUENTIAL", m_chunk_n_rows, n_chunk_n_cols,
|
||||
(size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
|
||||
// 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D)
|
||||
// HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D).
|
||||
|
||||
if (!use_pipeline) {
|
||||
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
|
||||
for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
|
||||
// transfer activation matrix chunk into VTCM
|
||||
const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
|
||||
const size_t n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);
|
||||
// A --> B: vtcm_qweight, 1 buffer
|
||||
// B --> C: vtcm_weight0/vtcm_weight1, 2 buffers
|
||||
// C --> D: vtcm_output0/vtcm_output1, 2 buffers
|
||||
|
||||
TIMER_START(activation_load);
|
||||
{
|
||||
const float *activation_chunk = activation + mr * k;
|
||||
transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
|
||||
}
|
||||
TIMER_STOP(activation_load);
|
||||
// Async timeline (C overlaps B+D):
|
||||
// main+HVX: [A0][Act][B0][A1][sub C0][B1‖C0][A2][wait,sub C1][D0+B2‖C1][wait,sub C2][D1‖C2][wait][D2]
|
||||
// HMX queue: [████ C0 ████████][████ C1 ████████████][████ C2 ████████]
|
||||
|
||||
void *buf_curr = vtcm_scratch0;
|
||||
void *buf_next = vtcm_scratch1;
|
||||
int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);
|
||||
hmx_matmul_job_t job_slots[2]; // persistent double-buffered job descriptors
|
||||
|
||||
{
|
||||
const size_t n_cols_first = hex_smin(n, n_chunk_n_cols);
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(buf_curr, permuted_weight), row_stride, row_stride, row_stride, n_cols_first);
|
||||
}
|
||||
for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
|
||||
const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
|
||||
|
||||
for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) {
|
||||
const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
|
||||
const size_t n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);
|
||||
void *vtcm_qweight = vtcm_weight;
|
||||
void *vtcm_weight_bufs[2] = { vtcm_scratch0, vtcm_scratch1 };
|
||||
void *vtcm_output_bufs[2] = { vtcm_output, vtcm_scratch2 };
|
||||
|
||||
TIMER_START(weight_load);
|
||||
{
|
||||
dma_queue_pop(ctx->dma[0]); // wait until current weight chunk become ready
|
||||
|
||||
const size_t nc_next = nc + n_chunk_n_cols;
|
||||
if (nc_next < n) {
|
||||
const size_t n_cols_next = hex_smin(n - nc_next, n_chunk_n_cols);
|
||||
|
||||
const uint8_t *next_weight_chunk = permuted_weight + nc_next * row_stride;
|
||||
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk), row_stride, row_stride, row_stride, n_cols_next);
|
||||
}
|
||||
|
||||
// Dequant + vscatter writes directly to [K, N] transposed tiles.
|
||||
// HMX computes C = A x B, where A=[M,K] activation, B=[K,N] weight.
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight, buf_curr, n_cols, k, row_stride, weight_type);
|
||||
|
||||
hex_swap_ptr(&buf_curr, &buf_next);
|
||||
}
|
||||
TIMER_STOP(weight_load);
|
||||
|
||||
TIMER_START(hmx_core);
|
||||
{
|
||||
core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, k / 32);
|
||||
}
|
||||
TIMER_STOP(hmx_core);
|
||||
|
||||
TIMER_START(output_store);
|
||||
{
|
||||
float *output = dst + (mr * n + nc);
|
||||
transfer_output_chunk_threaded(ctx, output, vtcm_output, n_rows, n_cols, n);
|
||||
}
|
||||
TIMER_STOP(output_store);
|
||||
}
|
||||
// prologue: A0
|
||||
const size_t n_cols_A0 = hex_smin(n - 0 * n_chunk_n_cols, n_chunk_n_cols);
|
||||
{
|
||||
const uint8_t *qweight_chunk_A0 = permuted_weight;
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A0), row_stride, row_stride, row_stride, n_cols_A0);
|
||||
}
|
||||
HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
|
||||
} else {
|
||||
// 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D)
|
||||
// HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D).
|
||||
|
||||
// A --> B: vtcm_qweight, 1 buffer
|
||||
// B --> C: vtcm_weight0/vtcm_weight1, 2 buffers
|
||||
// C --> D: vtcm_output0/vtcm_output1, 2 buffers
|
||||
{
|
||||
const float *activation_chunk = activation + mr * k;
|
||||
transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
|
||||
}
|
||||
|
||||
// Async timeline (C overlaps B+D):
|
||||
// main+HVX: [A0][Act][B0][A1][sub C0][B1‖C0][A2][wait,sub C1][D0+B2‖C1][wait,sub C2][D1‖C2][wait][D2]
|
||||
// HMX queue: [████ C0 ████████][████ C1 ████████████][████ C2 ████████]
|
||||
// prologue: B0, A1, submit C0 (async), B1 (overlaps C0)
|
||||
{
|
||||
// B0: wait for DMA, dequant weight chunk 0
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
|
||||
|
||||
int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);
|
||||
hmx_matmul_job_t job_slots[2]; // persistent double-buffered job descriptors
|
||||
|
||||
for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
|
||||
const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
|
||||
|
||||
void *vtcm_qweight = vtcm_weight;
|
||||
void *vtcm_weight_bufs[2] = { vtcm_scratch0, vtcm_scratch1 };
|
||||
void *vtcm_output_bufs[2] = { vtcm_output, vtcm_scratch2 };
|
||||
|
||||
// prologue: A0
|
||||
const size_t n_cols_A0 = hex_smin(n - 0 * n_chunk_n_cols, n_chunk_n_cols);
|
||||
{
|
||||
// Use 2D DMA (n_cols rows x row_stride) to avoid 16-bit roiwidth overflow.
|
||||
const uint8_t *qweight_chunk_A0 = permuted_weight;
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A0), row_stride, row_stride, row_stride, n_cols_A0);
|
||||
// A1: issue DMA for weight chunk 1
|
||||
const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
|
||||
if (1 < n_chunk_cnt) {
|
||||
const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
|
||||
}
|
||||
|
||||
{
|
||||
const float *activation_chunk = activation + mr * k;
|
||||
transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
|
||||
}
|
||||
// submit C0 (non-blocking — HMX worker executes in parallel)
|
||||
hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation,
|
||||
(__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
|
||||
hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
|
||||
hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
|
||||
hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0]));
|
||||
|
||||
// prologue: B0, A1, submit C0 (async), B1 (overlaps C0)
|
||||
{
|
||||
// B0: wait for DMA, dequant weight chunk 0
|
||||
// B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
|
||||
if (1 < n_chunk_cnt) {
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
|
||||
|
||||
// A1: issue DMA for weight chunk 1
|
||||
const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
|
||||
if (1 < n_chunk_cnt) {
|
||||
const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
|
||||
}
|
||||
|
||||
// submit C0 (non-blocking — HMX worker executes in parallel)
|
||||
hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation,
|
||||
(__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
|
||||
hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
|
||||
hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
|
||||
hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0]));
|
||||
|
||||
// B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
|
||||
if (1 < n_chunk_cnt) {
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
|
||||
}
|
||||
}
|
||||
|
||||
// main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1})
|
||||
for (int i = 0; i < n_chunk_cnt; ++i) {
|
||||
const size_t nc = i * n_chunk_n_cols;
|
||||
const size_t nc_p1 = nc + 1 * n_chunk_n_cols;
|
||||
const size_t nc_p2 = nc + 2 * n_chunk_n_cols;
|
||||
|
||||
const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
|
||||
const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols);
|
||||
const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols);
|
||||
|
||||
// issue A_{i+2}: DMA push (non-blocking)
|
||||
if (i + 2 < n_chunk_cnt) {
|
||||
const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
|
||||
}
|
||||
|
||||
// wait C_i: block until prologue/previous C completes
|
||||
hmx_queue_pop(ctx->hmx_queue);
|
||||
|
||||
// submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below)
|
||||
// job_slots[(i+1)%2] is safe: C_i just completed, freeing slot i%2's
|
||||
// counterpart — and (i+1)%2 was last used by C_{i-1} which completed
|
||||
// before C_i was submitted.
|
||||
if (i + 1 < n_chunk_cnt) {
|
||||
hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2],
|
||||
(__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2],
|
||||
vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
|
||||
hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
|
||||
hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]));
|
||||
}
|
||||
|
||||
// D_i: store output (multi-thread HVX, parallel with C_{i+1})
|
||||
float *output_chunk = dst + (mr * n + nc);
|
||||
transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n);
|
||||
|
||||
// B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
|
||||
if (i + 2 < n_chunk_cnt) {
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
|
||||
}
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
|
||||
}
|
||||
}
|
||||
|
||||
hmx_queue_suspend(ctx->hmx_queue);
|
||||
// main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1})
|
||||
for (int i = 0; i < n_chunk_cnt; ++i) {
|
||||
const size_t nc = i * n_chunk_n_cols;
|
||||
const size_t nc_p1 = nc + 1 * n_chunk_n_cols;
|
||||
const size_t nc_p2 = nc + 2 * n_chunk_n_cols;
|
||||
|
||||
const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
|
||||
const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols);
|
||||
const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols);
|
||||
|
||||
// issue A_{i+2}: DMA push (non-blocking)
|
||||
if (i + 2 < n_chunk_cnt) {
|
||||
const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
|
||||
}
|
||||
|
||||
// wait C_i: block until prologue/previous C completes
|
||||
hmx_queue_pop(ctx->hmx_queue);
|
||||
|
||||
// submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below)
|
||||
// job_slots[(i+1)%2] is safe: C_i just completed, freeing slot i%2's
|
||||
// counterpart — and (i+1)%2 was last used by C_{i-1} which completed
|
||||
// before C_i was submitted.
|
||||
if (i + 1 < n_chunk_cnt) {
|
||||
hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2],
|
||||
(__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2],
|
||||
vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
|
||||
hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
|
||||
hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]));
|
||||
}
|
||||
|
||||
// D_i: store output (multi-thread HVX, parallel with C_{i+1})
|
||||
float *output_chunk = dst + (mr * n + nc);
|
||||
transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n);
|
||||
|
||||
// B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
|
||||
if (i + 2 < n_chunk_cnt) {
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
hmx_queue_suspend(ctx->hmx_queue);
|
||||
|
||||
TIMER_STOP(total);
|
||||
|
||||
#if defined(ENABLE_PROFILE_TIMERS)
|
||||
FARF(HIGH, "%s: %lld us, m=%d k=%d n=%d pipeline=%d", __func__, TIMER_US(total), m, k, n, use_pipeline);
|
||||
FARF(HIGH, "hex-mm-q: %lld us : m %d k %d n %d", TIMER_US(total), m, k, n);
|
||||
if (!use_pipeline) {
|
||||
FARF(HIGH, " activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us",
|
||||
TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store));
|
||||
@@ -1370,15 +1061,15 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
|
||||
|
||||
//
|
||||
|
||||
static inline int hmx_matmul_batch_r2(const hmx_matmul_w16a32_batched_params_t *params) {
|
||||
static inline int hmx_matmul_batch_r2(const hmx_matmul_f16_f32_batched_params_t *params) {
|
||||
return params->ne02 > 0 ? params->ne12 / params->ne02 : 1;
|
||||
}
|
||||
|
||||
static inline int hmx_matmul_batch_r3(const hmx_matmul_w16a32_batched_params_t *params) {
|
||||
static inline int hmx_matmul_batch_r3(const hmx_matmul_f16_f32_batched_params_t *params) {
|
||||
return params->ne03 > 0 ? params->ne13 / params->ne03 : 1;
|
||||
}
|
||||
|
||||
static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params,
|
||||
static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params,
|
||||
int dst_b2, int dst_b3) {
|
||||
const int r2 = hmx_matmul_batch_r2(params);
|
||||
const int r3 = hmx_matmul_batch_r3(params);
|
||||
@@ -1387,37 +1078,36 @@ static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_w16a32_
|
||||
(size_t) (dst_b3 / r3) * params->src0_nb3);
|
||||
}
|
||||
|
||||
static inline const float *hmx_matmul_activation_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params,
|
||||
static inline const float *hmx_matmul_activation_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params,
|
||||
int dst_b2, int dst_b3) {
|
||||
return (const float *) ((const uint8_t *) params->activation +
|
||||
(size_t) dst_b2 * params->src1_nb2 +
|
||||
(size_t) dst_b3 * params->src1_nb3);
|
||||
}
|
||||
|
||||
static inline float *hmx_matmul_dst_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params,
|
||||
static inline float *hmx_matmul_dst_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params,
|
||||
int dst_b2, int dst_b3) {
|
||||
return (float *) ((uint8_t *) params->dst +
|
||||
(size_t) dst_b2 * params->dst_nb2 +
|
||||
(size_t) dst_b3 * params->dst_nb3);
|
||||
}
|
||||
|
||||
static int hmx_mat_mul_permuted_w16a32_batched_legacy(struct htp_context *ctx,
|
||||
const hmx_matmul_w16a32_batched_params_t *params) {
|
||||
static int hmx_matmul_f16_f32_batched_legacy(struct htp_context *ctx,
|
||||
const hmx_matmul_f16_f32_batched_params_t *params) {
|
||||
int ret = 0;
|
||||
for (int b3 = 0; b3 < params->ne13 && ret == 0; ++b3) {
|
||||
for (int b2 = 0; b2 < params->ne12 && ret == 0; ++b2) {
|
||||
ret = hmx_mat_mul_permuted_w16a32(ctx,
|
||||
hmx_matmul_dst_batch_ptr(params, b2, b3),
|
||||
hmx_matmul_activation_batch_ptr(params, b2, b3),
|
||||
hmx_matmul_weight_batch_ptr(params, b2, b3),
|
||||
params->m, params->k, params->n,
|
||||
params->act_stride, params->weight_stride);
|
||||
ret = hmx_matmul_f16_f32(ctx, hmx_matmul_dst_batch_ptr(params, b2, b3),
|
||||
hmx_matmul_activation_batch_ptr(params, b2, b3),
|
||||
hmx_matmul_weight_batch_ptr(params, b2, b3),
|
||||
params->m, params->k, params->n,
|
||||
params->act_stride, params->weight_stride);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmul_w16a32_batched_params_t *params) {
|
||||
int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params) {
|
||||
if (!ctx || !params || !params->dst || !params->activation || !params->permuted_weight) { return -1; }
|
||||
if (!params->m || !params->k || !params->n) { return -1; }
|
||||
if (params->act_stride < params->k || params->weight_stride < params->k || params->dst_stride < params->n) { return -1; }
|
||||
@@ -1435,7 +1125,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
|
||||
|
||||
if (group_size <= 1) {
|
||||
FARF(HIGH, "%s: no dim2 GQA reuse (group=%d), using legacy batched loop", __func__, group_size);
|
||||
return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
|
||||
return hmx_matmul_f16_f32_batched_legacy(ctx, params);
|
||||
}
|
||||
|
||||
// Grouped path: reuse interleaved weight across all q_heads sharing a
|
||||
@@ -1464,7 +1154,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
|
||||
/*m_block_cost=*/(size_t) params->n,
|
||||
/*n_block_cost=*/(size_t) params->m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
|
||||
FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__);
|
||||
return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
|
||||
return hmx_matmul_f16_f32_batched_legacy(ctx, params);
|
||||
}
|
||||
|
||||
const size_t act_head_stride = m_chunk_n_rows * (size_t) params->k; // fp16 elements between heads
|
||||
@@ -1486,7 +1176,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
|
||||
|
||||
if ((size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base) > vtcm_budget) {
|
||||
FARF(HIGH, "%s: grouped layout overflowed VTCM, falling back to legacy batched loop", __func__);
|
||||
return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
|
||||
return hmx_matmul_f16_f32_batched_legacy(ctx, params);
|
||||
}
|
||||
|
||||
hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16
|
||||
@@ -1614,7 +1304,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
|
||||
|
||||
//
|
||||
|
||||
int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
|
||||
int hmx_matmul_f16_f32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
|
||||
const __fp16 *restrict permuted_weight, int m, int k, int n,
|
||||
int act_stride, int weight_stride) {
|
||||
if (!dst || !activation || !permuted_weight || !m || !n || !k) { return -1; }
|
||||
|
||||
@@ -33,14 +33,14 @@ typedef struct {
|
||||
size_t src1_nb3;
|
||||
size_t dst_nb2;
|
||||
size_t dst_nb3;
|
||||
} hmx_matmul_w16a32_batched_params_t;
|
||||
} hmx_matmul_f16_f32_batched_params_t;
|
||||
|
||||
// HMX matrix multiplication — tile-permuted FP16 weights, FP32 activation/output
|
||||
// act_stride: activation row stride in elements (= k for contiguous, or
|
||||
// nb[1]/sizeof(float) for permuted tensors like attention Q).
|
||||
// weight_stride: weight row stride in elements (= k for compact weights, or
|
||||
// nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK).
|
||||
int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx,
|
||||
int hmx_matmul_f16_f32(struct htp_context *ctx,
|
||||
float *restrict dst,
|
||||
const float *activation,
|
||||
const __fp16 *permuted_weight,
|
||||
@@ -48,13 +48,12 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx,
|
||||
int act_stride,
|
||||
int weight_stride);
|
||||
|
||||
// Batched F16 wrapper over hmx_mat_mul_permuted_w16a32.
|
||||
// Batched F16 wrapper over hmx_mat_mul_f16_f32.
|
||||
// Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3.
|
||||
int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx,
|
||||
const hmx_matmul_w16a32_batched_params_t *params);
|
||||
int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params);
|
||||
|
||||
// HMX matrix multiplication — tile-permuted quantised weights (Q4_0/Q8_0/IQ4_NL)
|
||||
int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx,
|
||||
// HMX matrix multiplication — quantised weights (Q4_0/Q8_0/IQ4_NL/MXFP4)
|
||||
int hmx_matmul_q_f32(struct htp_context *ctx,
|
||||
float *restrict dst,
|
||||
const float *activation,
|
||||
const uint8_t *permuted_weight,
|
||||
|
||||
@@ -87,6 +87,27 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
||||
}
|
||||
}
|
||||
|
||||
#if __HVX_ARCH__ >= 75
|
||||
{
|
||||
// Power on HMX and set HMX clock
|
||||
HAP_power_request_t request;
|
||||
memset(&request, 0, sizeof(HAP_power_request_t));
|
||||
request.type = HAP_power_set_HMX_v2;
|
||||
request.hmx_v2.set_power = TRUE;
|
||||
request.hmx_v2.power_up = TRUE;
|
||||
request.hmx_v2.set_clock = TRUE;
|
||||
request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
|
||||
request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX;
|
||||
request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX;
|
||||
request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH;
|
||||
FARF(ALWAYS, "Setting HMX clock\n");
|
||||
err = HAP_power_set((void *) ctx, &request);
|
||||
if (err != AEE_SUCCESS) {
|
||||
FARF(ERROR, "Error setting HMX clock.");
|
||||
return err;
|
||||
}
|
||||
}
|
||||
#else
|
||||
{
|
||||
// Power on HMX
|
||||
HAP_power_request_t request;
|
||||
@@ -94,31 +115,12 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
||||
request.type = HAP_power_set_HMX;
|
||||
request.hmx.power_up = TRUE;
|
||||
FARF(ALWAYS, "Powering HMX on\n");
|
||||
err = HAP_power_set((void *) &ctx, &request);
|
||||
err = HAP_power_set((void *) ctx, &request);
|
||||
if (err != AEE_SUCCESS) {
|
||||
FARF(ERROR, "Error powering on HMX.");
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
#if __HVX_ARCH__ >= 75
|
||||
{
|
||||
// Set HMX clock
|
||||
HAP_power_request_t request;
|
||||
memset(&request, 0, sizeof(HAP_power_request_t));
|
||||
request.type = HAP_power_set_HMX_v2;
|
||||
request.hmx_v2.set_clock = TRUE;
|
||||
request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
|
||||
request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX;
|
||||
request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX;
|
||||
request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH;
|
||||
FARF(ALWAYS, "Setting HMX clock\n");
|
||||
err = HAP_power_set((void *) &ctx, &request);
|
||||
if (err != AEE_SUCCESS) {
|
||||
FARF(ERROR, "Error setting HMX clock.");
|
||||
return err;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return AEE_SUCCESS;
|
||||
|
||||
@@ -2995,7 +2995,6 @@ int op_matmul(struct htp_ops_context * octx) {
|
||||
// is handled by HMX itself; when M < 32 fall back to HVX.
|
||||
const int m_total = (int) src1->ne[1];
|
||||
const int m_hmx = m_total & ~31; // 0 when M < 32
|
||||
|
||||
if (m_hmx == 0) {
|
||||
return op_matmul_hvx(octx);
|
||||
}
|
||||
@@ -3020,7 +3019,7 @@ int op_matmul(struct htp_ops_context * octx) {
|
||||
|
||||
if (src0->type == HTP_TYPE_F16) {
|
||||
if (is_batched) {
|
||||
hmx_matmul_w16a32_batched_params_t batch_params = {
|
||||
hmx_matmul_f16_f32_batched_params_t batch_params = {
|
||||
.dst = (float *) dst->data,
|
||||
.activation = (float *) src1->data,
|
||||
.permuted_weight = (const __fp16 *) src0->data,
|
||||
@@ -3041,15 +3040,14 @@ int op_matmul(struct htp_ops_context * octx) {
|
||||
.dst_nb2 = dst->nb[2],
|
||||
.dst_nb3 = dst->nb[3],
|
||||
};
|
||||
ret = hmx_mat_mul_permuted_w16a32_batched(octx->ctx, &batch_params);
|
||||
ret = hmx_matmul_f16_f32_batched(octx->ctx, &batch_params);
|
||||
} else {
|
||||
ret = hmx_mat_mul_permuted_w16a32(octx->ctx,
|
||||
ret = hmx_matmul_f16_f32(octx->ctx,
|
||||
(float*) dst->data, (float*) src1->data, (const __fp16 *) src0->data,
|
||||
m_total, k, n, act_stride, wgt_stride);
|
||||
}
|
||||
} else {
|
||||
ret = hmx_mat_mul_permuted_qk_0_d16a32(octx->ctx,
|
||||
(float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data,
|
||||
ret = hmx_matmul_q_f32(octx->ctx, (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data,
|
||||
m_total, k, n, (int) src0->type);
|
||||
}
|
||||
|
||||
|
||||
@@ -107,7 +107,7 @@ static inline void rope_yarn_one(float theta, float freq_scale, float * corr_dim
|
||||
cache[i0 + 1] = sinf(theta_final) * mscale_final;
|
||||
}
|
||||
|
||||
static void rope_cache_init(const float theta_base,
|
||||
static __attribute__((noinline)) void rope_cache_init(const float theta_base,
|
||||
const float freq_scale,
|
||||
const float * freq_factors,
|
||||
float * corr_dims,
|
||||
@@ -129,7 +129,7 @@ static void rope_cache_init(const float theta_base,
|
||||
|
||||
// pos_t/h/w/e: the four position ids for this sequence step (t=time, h=height, w=width, e=extra).
|
||||
// sections[4]: number of head dims assigned to each position component.
|
||||
static void mrope_cache_init(const float pos_t,
|
||||
static __attribute__((noinline)) void mrope_cache_init(const float pos_t,
|
||||
const float pos_h,
|
||||
const float pos_w,
|
||||
const float pos_e,
|
||||
|
||||
@@ -20,55 +20,56 @@
|
||||
#include "htp-ops.h"
|
||||
#include "hvx-utils.h"
|
||||
|
||||
#define htp_ssm_conv_tensors_preamble \
|
||||
const struct htp_tensor * restrict src0 = octx->src[0]; \
|
||||
const struct htp_tensor * restrict src1 = octx->src[1]; \
|
||||
const struct htp_tensor * restrict dst = octx->dst; \
|
||||
struct htp_spad * restrict src0_spad = &octx->src0_spad; \
|
||||
struct htp_spad * restrict src1_spad = &octx->src1_spad; \
|
||||
struct htp_spad * restrict dst_spad = &octx->dst_spad; \
|
||||
\
|
||||
const uint32_t ne00 = src0->ne[0]; \
|
||||
const uint32_t ne01 = src0->ne[1]; \
|
||||
const uint32_t ne02 = src0->ne[2]; \
|
||||
const uint32_t ne03 = src0->ne[3]; \
|
||||
\
|
||||
const uint32_t ne10 = src1->ne[0]; \
|
||||
const uint32_t ne11 = src1->ne[1]; \
|
||||
const uint32_t ne12 = src1->ne[2]; \
|
||||
const uint32_t ne13 = src1->ne[3]; \
|
||||
\
|
||||
const uint32_t ne0 = dst->ne[0]; \
|
||||
const uint32_t ne1 = dst->ne[1]; \
|
||||
const uint32_t ne2 = dst->ne[2]; \
|
||||
const uint32_t ne3 = dst->ne[3]; \
|
||||
\
|
||||
const uint32_t nb00 = src0->nb[0]; \
|
||||
const uint32_t nb01 = src0->nb[1]; \
|
||||
const uint32_t nb02 = src0->nb[2]; \
|
||||
const uint32_t nb03 = src0->nb[3]; \
|
||||
\
|
||||
const uint32_t nb10 = src1->nb[0]; \
|
||||
const uint32_t nb11 = src1->nb[1]; \
|
||||
const uint32_t nb12 = src1->nb[2]; \
|
||||
const uint32_t nb13 = src1->nb[3]; \
|
||||
\
|
||||
const uint32_t nb0 = dst->nb[0]; \
|
||||
const uint32_t nb1 = dst->nb[1]; \
|
||||
const uint32_t nb2 = dst->nb[2]; \
|
||||
#define htp_ssm_conv_tensors_preamble \
|
||||
const struct htp_tensor * restrict src0 = octx->src[0]; \
|
||||
const struct htp_tensor * restrict src1 = octx->src[1]; \
|
||||
const struct htp_tensor * restrict dst = octx->dst; \
|
||||
struct htp_spad * restrict src0_spad = &octx->src0_spad; \
|
||||
struct htp_spad * restrict src1_spad = &octx->src1_spad; \
|
||||
struct htp_spad * restrict dst_spad = &octx->dst_spad; \
|
||||
\
|
||||
const uint32_t ne00 = src0->ne[0]; \
|
||||
const uint32_t ne01 = src0->ne[1]; \
|
||||
const uint32_t ne02 = src0->ne[2]; \
|
||||
const uint32_t ne03 = src0->ne[3]; \
|
||||
\
|
||||
const uint32_t ne10 = src1->ne[0]; \
|
||||
const uint32_t ne11 = src1->ne[1]; \
|
||||
const uint32_t ne12 = src1->ne[2]; \
|
||||
const uint32_t ne13 = src1->ne[3]; \
|
||||
\
|
||||
const uint32_t ne0 = dst->ne[0]; \
|
||||
const uint32_t ne1 = dst->ne[1]; \
|
||||
const uint32_t ne2 = dst->ne[2]; \
|
||||
const uint32_t ne3 = dst->ne[3]; \
|
||||
\
|
||||
const uint32_t nb00 = src0->nb[0]; \
|
||||
const uint32_t nb01 = src0->nb[1]; \
|
||||
const uint32_t nb02 = src0->nb[2]; \
|
||||
const uint32_t nb03 = src0->nb[3]; \
|
||||
\
|
||||
const uint32_t nb10 = src1->nb[0]; \
|
||||
const uint32_t nb11 = src1->nb[1]; \
|
||||
const uint32_t nb12 = src1->nb[2]; \
|
||||
const uint32_t nb13 = src1->nb[3]; \
|
||||
\
|
||||
const uint32_t nb0 = dst->nb[0]; \
|
||||
const uint32_t nb1 = dst->nb[1]; \
|
||||
const uint32_t nb2 = dst->nb[2]; \
|
||||
const uint32_t nb3 = dst->nb[3];
|
||||
|
||||
struct htp_ssm_conv_context {
|
||||
struct htp_ops_context * octx;
|
||||
uint32_t nrows_per_thread;
|
||||
uint32_t d_inner_tile;
|
||||
uint64_t t_start;
|
||||
};
|
||||
|
||||
#define htp_ssm_conv_preamble \
|
||||
#define htp_ssm_conv_preamble \
|
||||
struct htp_ssm_conv_context * scctx = (struct htp_ssm_conv_context *) data; \
|
||||
struct htp_ops_context * octx = scctx->octx; \
|
||||
htp_ssm_conv_tensors_preamble; \
|
||||
dma_queue * dma_queue = octx->ctx->dma[ith];
|
||||
struct htp_ops_context * octx = scctx->octx; \
|
||||
htp_ssm_conv_tensors_preamble; \
|
||||
dma_queue * dma_queue = octx->ctx->dma[ith];
|
||||
|
||||
// Scalar FP32 SSM_CONV implementation
|
||||
static void ssm_conv_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) {
|
||||
@@ -128,118 +129,211 @@ static void ssm_conv_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
|
||||
dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
// HVX FP32 SSM_CONV implementation - vectorizes across d_inner dimension
|
||||
|
||||
// In-register 32x32 fp32 transpose using std 5-stage HVX vshuff butterfly.
|
||||
static inline void hvx_transpose_32x32_f32(HVX_Vector m[32]) {
|
||||
HVX_Vector tmp[32];
|
||||
|
||||
// Stage 0 (R = -4): pair (2i, 2i+1) for i = 0..15. m -> tmp.
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
HVX_VectorPair p = Q6_W_vshuff_VVR(m[2*i + 1], m[2*i], -4);
|
||||
tmp[2*i + 0] = Q6_V_lo_W(p);
|
||||
tmp[2*i + 1] = Q6_V_hi_W(p);
|
||||
}
|
||||
|
||||
// Stage 1 (R = -8): per block of 4, pair (b+0, b+2) and (b+1, b+3). tmp -> m.
|
||||
for (int b = 0; b < 32; b += 4) {
|
||||
HVX_VectorPair p0 = Q6_W_vshuff_VVR(tmp[b + 2], tmp[b + 0], -8);
|
||||
HVX_VectorPair p1 = Q6_W_vshuff_VVR(tmp[b + 3], tmp[b + 1], -8);
|
||||
m[b + 0] = Q6_V_lo_W(p0); m[b + 1] = Q6_V_hi_W(p0);
|
||||
m[b + 2] = Q6_V_lo_W(p1); m[b + 3] = Q6_V_hi_W(p1);
|
||||
}
|
||||
|
||||
// Stage 2 (R = -16): per block of 8, pair (b+i, b+i+4) for i = 0..3. m -> tmp.
|
||||
for (int b = 0; b < 32; b += 8) {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
HVX_VectorPair p = Q6_W_vshuff_VVR(m[b + i + 4], m[b + i], -16);
|
||||
tmp[b + 2*i + 0] = Q6_V_lo_W(p);
|
||||
tmp[b + 2*i + 1] = Q6_V_hi_W(p);
|
||||
}
|
||||
}
|
||||
|
||||
// Stage 3 (R = -32): per block of 16, pair (b+i, b+i+8) for i = 0..7. tmp -> m.
|
||||
for (int b = 0; b < 32; b += 16) {
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
HVX_VectorPair p = Q6_W_vshuff_VVR(tmp[b + i + 8], tmp[b + i], -32);
|
||||
m[b + 2*i + 0] = Q6_V_lo_W(p);
|
||||
m[b + 2*i + 1] = Q6_V_hi_W(p);
|
||||
}
|
||||
}
|
||||
|
||||
// Stage 4 (R = -64): pair (i, i+16) for i = 0..15. m -> tmp -> m.
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
HVX_VectorPair p = Q6_W_vshuff_VVR(m[i + 16], m[i], -64);
|
||||
tmp[2 * i + 0] = Q6_V_lo_W(p);
|
||||
tmp[2 * i + 1] = Q6_V_hi_W(p);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
m[i] = tmp[i];
|
||||
}
|
||||
}
|
||||
|
||||
// HVX FP32 SSM_CONV implementation - channel-vectorized HVX kernel with src0/src1
|
||||
// transposed into VTCM.
|
||||
//
|
||||
// VTCM layouts (per thread):
|
||||
// src1_T : {d_inner_per_thread, d_conv} — staged once per launch (small).
|
||||
// src0_T : {d_inner_tile, ncs} — staged per d_inner-tile.
|
||||
//
|
||||
// d_inner_tile is chosen so that per-thread VTCM stays under the budget.
|
||||
// Each thread iterates ceil(d_inner_per_thread d_inner_tile) tiles serially.
|
||||
#define HTP_SSM_CONV_VTCM_BUDGET (1u << 20) // 1 MiB per thread
|
||||
|
||||
// Scalar transpose: src1 {d_conv, d_inner} (DDR) -> {d_inner_per_thread, d_conv} (VTCM)
|
||||
static inline void transpose_src1(const float * src1_data,
|
||||
uint32_t src1_stride_inner,
|
||||
uint32_t i1_off,
|
||||
uint32_t d_inner_per_thread,
|
||||
uint32_t d_conv,
|
||||
float * src1_T) {
|
||||
for (uint32_t i = 0; i < d_inner_per_thread; ++i) {
|
||||
const float * src_row = src1_data + (i1_off + i) * src1_stride_inner;
|
||||
for (uint32_t j = 0; j < d_conv; ++j) {
|
||||
src1_T[j * d_inner_per_thread + i] = src_row[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// HVX 32x32 src0 transpose: src0 {ncs, d_inner} (DDR) -> src0_T {d_inner_tile, ncs} (VTCM)
|
||||
static inline void transpose_src0_block(const float * src0_block,
|
||||
uint32_t ncs,
|
||||
uint32_t cb_n,
|
||||
uint32_t d_inner_tile,
|
||||
float * src0_T_block_dst,
|
||||
uint32_t cb /* dst column offset */) {
|
||||
const uint32_t T_TILE = VLEN_FP32;
|
||||
|
||||
HVX_Vector __attribute__((aligned(VLEN))) sub[32];
|
||||
|
||||
for (uint32_t t0 = 0; t0 < ncs; t0 += T_TILE) {
|
||||
const uint32_t t_n = MIN(T_TILE, ncs - t0);
|
||||
|
||||
// Load 32 rows (channels) of T_TILE samples; pad missing channels with zeros.
|
||||
for (uint32_t r = 0; r < cb_n; ++r) {
|
||||
const float * src_row = src0_block + r * ncs + t0;
|
||||
if (t_n == T_TILE) {
|
||||
sub[r] = *(const HVX_UVector *) src_row;
|
||||
} else {
|
||||
HVX_Vector v = hvx_vec_splat_f32(0.0f);
|
||||
hvx_vec_store_u(&v, t_n * sizeof(float), hvx_vec_splat_f32(0.0f));
|
||||
|
||||
float __attribute__((aligned(VLEN))) tmp[VLEN_FP32] = { 0 };
|
||||
for (uint32_t k = 0; k < t_n; ++k) tmp[k] = src_row[k];
|
||||
v = *(const HVX_Vector *) tmp;
|
||||
sub[r] = v;
|
||||
}
|
||||
}
|
||||
for (uint32_t r = cb_n; r < T_TILE; ++r) {
|
||||
sub[r] = hvx_vec_splat_f32(0.0f);
|
||||
}
|
||||
|
||||
hvx_transpose_32x32_f32(sub);
|
||||
|
||||
// Store transposed sub-tile to src0_T at offsets (t0 + j) * d_inner_tile + cb.
|
||||
// Only write the valid t_n rows of the transposed result.
|
||||
for (uint32_t r = 0; r < t_n; ++r) {
|
||||
float * dst = src0_T_block_dst + (t0 + r) * d_inner_tile + cb;
|
||||
if (cb_n == T_TILE) {
|
||||
*(HVX_UVector *) dst = sub[r];
|
||||
} else {
|
||||
hvx_vec_store_u(dst, cb_n * sizeof(float), sub[r]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void *data) {
|
||||
htp_ssm_conv_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
const int nc = src1->ne[0]; // d_conv
|
||||
const int ncs = src0->ne[0]; // d_conv - 1 + n_t
|
||||
|
||||
const uint32_t d_conv = src1->ne[0];
|
||||
const uint32_t d_inner = src0->ne[1];
|
||||
const uint32_t n_t = dst->ne[1];
|
||||
const uint32_t n_s = dst->ne[2];
|
||||
const uint32_t ncs = src0->ne[0];
|
||||
|
||||
const uint32_t src0_stride_inner = src0->nb[1] / sizeof(float);
|
||||
const uint32_t src0_stride_seq = src0->nb[2] / sizeof(float);
|
||||
const uint32_t src1_stride_inner = src1->nb[1] / sizeof(float);
|
||||
const uint32_t dst_stride_token = dst->nb[1] / sizeof(float);
|
||||
const uint32_t dst_stride_seq = dst->nb[2] / sizeof(float);
|
||||
|
||||
const uint32_t dr = scctx->nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = MIN(ir0 + dr, d_inner);
|
||||
|
||||
if (ir0 >= ir1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint32_t d_inner_per_thread = ir1 - ir0;
|
||||
const uint32_t d_inner_tile = scctx->d_inner_tile;
|
||||
|
||||
const float * src0_data = (const float *) src0->data;
|
||||
const float * src1_data = (const float *) src1->data;
|
||||
float * dst_data = (float *) dst->data;
|
||||
float * dst_data = (float *) dst->data;
|
||||
|
||||
// Calculate row range for this thread
|
||||
const int dr = scctx->nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = MIN(ir0 + dr, d_inner);
|
||||
const uint32_t ir = ir1 - ir0;
|
||||
// Per-thread VTCM regions.
|
||||
float * src0_T = (float *)(octx->src0_spad.data + ith * octx->src0_spad.size_per_thread);
|
||||
float * src1_T = (float *)(octx->src1_spad.data + ith * octx->src1_spad.size_per_thread);
|
||||
|
||||
if (ir0 >= ir1) {
|
||||
return; // No work for this thread
|
||||
}
|
||||
// Stage src1 weights once into VTCM in {d_inner_per_thread, d_conv} layout.
|
||||
transpose_src1(src1_data, src1_stride_inner, ir0, d_inner_per_thread, d_conv, src1_T);
|
||||
|
||||
// src0 and src1 gather offsets
|
||||
uint32_t __attribute__((aligned(VLEN))) src0_offsets[VLEN_FP32] = { 0 };
|
||||
uint32_t __attribute__((aligned(VLEN))) src1_offsets[VLEN_FP32] = { 0 };
|
||||
|
||||
for (uint32_t i = 0; i < VLEN_FP32; ++i) {
|
||||
src0_offsets[i] = i * (ncs) * sizeof(float);
|
||||
src1_offsets[i] = i * (d_conv) * sizeof(float);
|
||||
}
|
||||
|
||||
const uint32_t src0_gather_len = VLEN * ncs;
|
||||
const uint32_t src1_gather_len = VLEN * d_conv;
|
||||
|
||||
// gather scratchpads
|
||||
HVX_Vector * src0_vec = (HVX_Vector *) (octx->ctx->vtcm_base + ith * VLEN*2 + 0);
|
||||
HVX_Vector * src1_vec = (HVX_Vector *) (octx->ctx->vtcm_base + ith * VLEN*2 + VLEN);
|
||||
|
||||
float * data_src0 = (float *) ((char *) src0->data + ir0 * src0->nb[1]);
|
||||
float * data_src1 = (float *) ((char *) src1->data + ir0 * src1->nb[1]);
|
||||
|
||||
uint8_t * spad_src0 = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread;
|
||||
uint8_t * spad_src1 = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread;
|
||||
|
||||
// copy src1 workload to VTCM
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src1, data_src1), nb11, nb11, ir);
|
||||
|
||||
// FARF(HIGH, "ssm-conv-src1-fetch %d: ir0 %u size %u\n", ith, ir0, nb11 * ir);
|
||||
const uint32_t C_TILE = VLEN_FP32;
|
||||
|
||||
for (uint32_t i3 = 0; i3 < n_s; ++i3) {
|
||||
float * src0_data_ptr = (float *) ((char *) data_src0 + i3 * (src0->nb[2]));
|
||||
for (uint32_t tile_off = 0; tile_off < d_inner_per_thread; tile_off += d_inner_tile) {
|
||||
const uint32_t tile_n = MIN(d_inner_tile, d_inner_per_thread - tile_off);
|
||||
|
||||
// copy src0 workload to VTCM
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0, src0_data_ptr), nb01, nb01, ir);
|
||||
// Place src0 chunk into VTCM in {d_inner_tile, ncs} layout.
|
||||
const float * src0_block = src0_data + i3 * src0_stride_seq + (ir0 + tile_off) * src0_stride_inner;
|
||||
|
||||
// FARF(HIGH, "ssm-conv-src0-fetch %d: ir0 %u i3 %u size %u\n", ith, ir0, i3, nb01 * ir);
|
||||
|
||||
dma_queue_flush(dma_queue);
|
||||
|
||||
for (uint32_t i2 = 0; i2 < n_t; ++i2) {
|
||||
float * dst_ptr = (float *) ((char *) dst->data + ir0 * (dst->nb[0]) + i2 * (dst->nb[1]) + i3 * (dst->nb[2]));
|
||||
|
||||
const uint32_t nvec = ir / VLEN_FP32;
|
||||
const uint32_t nloe = ir % VLEN_FP32;
|
||||
uint32_t i1 = 0;
|
||||
|
||||
for (uint32_t vi1 = 0; vi1 < nvec; vi1++) {
|
||||
HVX_Vector acc_vec = Q6_V_vsplat_R(0);
|
||||
|
||||
for (uint32_t i0 = 0; i0 < d_conv; ++i0) {
|
||||
uint32_t src0_base = (uint32_t) spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0]);
|
||||
uint32_t src1_base = (uint32_t) spad_src1 + (i0 + i1 * nc) * sizeof(float);
|
||||
Q6_vgather_ARMVw(src0_vec, src0_base, src0_gather_len, (*(const HVX_Vector *) src0_offsets));
|
||||
Q6_vgather_ARMVw(src1_vec, src1_base, src1_gather_len, (*(const HVX_Vector *) src1_offsets));
|
||||
|
||||
HVX_Vector prod = Q6_Vqf32_vmpy_VsfVsf(*(const HVX_Vector *) src0_vec, *(const HVX_Vector *) src1_vec);
|
||||
acc_vec = Q6_Vqf32_vadd_Vqf32Vqf32(acc_vec, prod);
|
||||
}
|
||||
|
||||
*(HVX_UVector *) (dst_ptr + i1) = Q6_Vsf_equals_Vqf32(acc_vec);
|
||||
i1 += VLEN_FP32;
|
||||
for (uint32_t cb = 0; cb < tile_n; cb += C_TILE) {
|
||||
const uint32_t cb_n = MIN(C_TILE, tile_n - cb);
|
||||
transpose_src0_block(src0_block + cb * src0_stride_inner, ncs, cb_n, d_inner_tile, src0_T, cb);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector acc_vec = Q6_V_vsplat_R(0);
|
||||
for (uint32_t t = 0; t < n_t; ++t) {
|
||||
for (uint32_t cb = 0; cb < tile_n; cb += C_TILE) {
|
||||
const uint32_t cb_n = MIN(C_TILE, tile_n - cb);
|
||||
|
||||
for (uint32_t i0 = 0; i0 < d_conv; ++i0) {
|
||||
uint32_t src0_base = (uint32_t) spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0]);
|
||||
uint32_t src1_base = (uint32_t) spad_src1 + (i0 + i1 * nc) * sizeof(float);
|
||||
Q6_vgather_ARMVw(src0_vec, src0_base, src0_gather_len, (*(const HVX_Vector *) src0_offsets));
|
||||
Q6_vgather_ARMVw(src1_vec, src1_base, src1_gather_len, (*(const HVX_Vector *) src1_offsets));
|
||||
HVX_Vector acc = hvx_vec_splat_f32(0.0f);
|
||||
for (uint32_t j = 0; j < d_conv; ++j) {
|
||||
HVX_Vector x = *(const HVX_Vector *) (src0_T + (t + j) * d_inner_tile + cb);
|
||||
HVX_Vector w = *(const HVX_Vector *) (src1_T + j * d_inner_per_thread + tile_off + cb);
|
||||
acc = Q6_Vqf32_vadd_Vqf32Vqf32(acc, Q6_Vqf32_vmpy_VsfVsf(x, w));
|
||||
}
|
||||
HVX_Vector res = Q6_Vsf_equals_Vqf32(acc);
|
||||
|
||||
HVX_Vector prod = Q6_Vqf32_vmpy_VsfVsf(*(const HVX_Vector *) src0_vec, *(const HVX_Vector *) src1_vec);
|
||||
acc_vec = Q6_Vqf32_vadd_Vqf32Vqf32(acc_vec, prod);
|
||||
float * dst_ptr = dst_data + i3 * dst_stride_seq + t * dst_stride_token + (ir0 + tile_off + cb);
|
||||
if (cb_n == C_TILE) {
|
||||
*(HVX_UVector *) dst_ptr = res;
|
||||
} else {
|
||||
hvx_vec_store_u(dst_ptr, cb_n * sizeof(float), res);
|
||||
}
|
||||
}
|
||||
|
||||
hvx_vec_store_u(dst_ptr + i1, (ir - i1) * 4, Q6_Vsf_equals_Vqf32(acc_vec));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "ssm-conv-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n",
|
||||
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0, ir1,
|
||||
FARF(HIGH, "ssm-conv-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) tile=%u * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n",
|
||||
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0, ir1, d_inner_tile,
|
||||
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1],
|
||||
dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
@@ -264,46 +358,44 @@ int op_ssm_conv_f32(struct htp_ops_context * octx) {
|
||||
|
||||
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
|
||||
uint32_t use_hvx = 0;
|
||||
if (d_inner >= VLEN_FP32 && d_inner % VLEN_FP32 == 0) {
|
||||
int is_aligned = hex_is_aligned((void *) src0->data, VLEN) &&
|
||||
hex_is_aligned((void *) src1->data, VLEN) &&
|
||||
hex_is_aligned((void *) dst->data, VLEN);
|
||||
|
||||
if (is_aligned) {
|
||||
use_hvx = 1;
|
||||
}
|
||||
if (d_inner >= VLEN_FP32 && n_t >= VLEN_FP32) {
|
||||
use_hvx = 1;
|
||||
}
|
||||
|
||||
if (use_hvx) {
|
||||
scctx.nrows_per_thread = (d_inner + n_threads - 1) / n_threads; // d_inner chunks per thread
|
||||
scctx.nrows_per_thread += (scctx.nrows_per_thread & 1); // round up to even
|
||||
scctx.nrows_per_thread = (d_inner + n_threads - 1) / n_threads;
|
||||
scctx.nrows_per_thread += (scctx.nrows_per_thread & 1);
|
||||
|
||||
octx->src0_spad.size_per_thread = hex_round_up(scctx.nrows_per_thread * nb01, 256);
|
||||
octx->src1_spad.size_per_thread = hex_round_up(scctx.nrows_per_thread * nb11, 256);
|
||||
octx->dst_spad.size_per_thread = hex_round_up(scctx.nrows_per_thread * sizeof(float), 256);
|
||||
const uint32_t d_inner_per_thread = scctx.nrows_per_thread;
|
||||
const uint32_t ncs = src0->ne[0];
|
||||
|
||||
const uint32_t src1_T_size = hex_round_up(d_conv * d_inner_per_thread * sizeof(float), 256);
|
||||
const uint32_t src0_T_max = HTP_SSM_CONV_VTCM_BUDGET > src1_T_size ? HTP_SSM_CONV_VTCM_BUDGET - src1_T_size : 0;
|
||||
|
||||
uint32_t d_inner_tile = (src0_T_max / sizeof(float)) / ncs;
|
||||
d_inner_tile -= (d_inner_tile % VLEN_FP32);
|
||||
if (d_inner_tile == 0) {
|
||||
FARF(HIGH, "ssm_conv-f32: inner tile rounds to 0 (ncs=%u), falling back to scalar\n", ncs);
|
||||
use_hvx = 0;
|
||||
} else {
|
||||
scctx.d_inner_tile = d_inner_tile;
|
||||
|
||||
octx->src0_spad.size_per_thread = hex_round_up(d_inner_tile * ncs * sizeof(float), 256);
|
||||
octx->src1_spad.size_per_thread = src1_T_size;
|
||||
octx->dst_spad.size_per_thread = 0;
|
||||
|
||||
octx->src0_spad.size = octx->src0_spad.size_per_thread * n_threads;
|
||||
octx->src1_spad.size = octx->src1_spad.size_per_thread * n_threads;
|
||||
octx->dst_spad.size = octx->dst_spad.size_per_thread * n_threads;
|
||||
octx->dst_spad.size = 0;
|
||||
|
||||
// Compute gather scratchpad size for src0 and src1
|
||||
const size_t gather_spad_size = n_threads * VLEN * 2;
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
octx->src0_spad.src = NULL;
|
||||
octx->src1_spad.src = NULL;
|
||||
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base + gather_spad_size; octx->src0_spad.src = NULL;
|
||||
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL;
|
||||
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; octx->dst_spad.src = NULL;
|
||||
|
||||
FARF(HIGH, "ssm_conv-f32: gather-spad:%zu spad-per-thread:(%u:%u:%u) spad-sizes:(%u:%u:%u) spad-data:(%p:%p:%p)\n",
|
||||
gather_spad_size, octx->src0_spad.size_per_thread, octx->src1_spad.size_per_thread,
|
||||
octx->dst_spad.size_per_thread, octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size,
|
||||
octx->src0_spad.data, octx->src1_spad.data, octx->dst_spad.data);
|
||||
|
||||
const size_t total_spad_size =
|
||||
gather_spad_size + octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
|
||||
|
||||
if (total_spad_size > octx->ctx->vtcm_size) {
|
||||
FARF(HIGH, "ssm_conv-f32: HVX scratchpad size %zu exceeds VTCM size %zu", total_spad_size,
|
||||
octx->ctx->vtcm_size);
|
||||
const size_t total_spad = octx->src0_spad.size + octx->src1_spad.size;
|
||||
if (total_spad > octx->ctx->vtcm_size) {
|
||||
FARF(HIGH, "ssm_conv-f32: scratchpad %zu exceeds VTCM %zu, falling back to scalar\n",
|
||||
total_spad, octx->ctx->vtcm_size);
|
||||
use_hvx = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -564,9 +564,20 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
|
||||
|
||||
const int nth = std::min(1024, ne0);
|
||||
int nth = std::min(256, ne0);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
|
||||
// when rows are small, we can batch them together in a single threadgroup
|
||||
int nrptg = 1;
|
||||
if (nth < 256) {
|
||||
nrptg = std::min((256 + nth - 1) / nth, ne1);
|
||||
if (nrptg * nth > 256) {
|
||||
nrptg = 256 / nth;
|
||||
}
|
||||
}
|
||||
|
||||
const int nw0 = (ne1 + nrptg - 1) / nrptg;
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, nw0, ne2, ne3, nth, nrptg, 1);
|
||||
|
||||
return 1;
|
||||
}
|
||||
@@ -1786,7 +1797,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
|
||||
nk0 = ne10/ggml_blck_size(op->type);
|
||||
}
|
||||
|
||||
int nth = std::min<int>(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||
int nth = std::min<int>(nk0*ne11, 256);
|
||||
|
||||
// when rows are small, we can batch them together in a single threadgroup
|
||||
int nrptg = 1;
|
||||
@@ -1797,7 +1808,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
|
||||
nrptg = (nth + nk0 - 1)/nk0;
|
||||
nth = nk0;
|
||||
|
||||
if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
|
||||
if (nrptg*nth > 256) {
|
||||
nrptg--;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7486,7 +7486,11 @@ kernel void kernel_concat(
|
||||
|
||||
const int i3 = tgpig.z;
|
||||
const int i2 = tgpig.y;
|
||||
const int i1 = tgpig.x;
|
||||
const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y;
|
||||
|
||||
if (i1 >= args.ne1) {
|
||||
return;
|
||||
}
|
||||
|
||||
int o[4] = {0, 0, 0, 0};
|
||||
o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));
|
||||
|
||||
@@ -375,6 +375,11 @@ struct ggml_backend_opencl_device_context {
|
||||
ggml_backend_buffer_type buffer_type;
|
||||
|
||||
cl_context context = nullptr;
|
||||
|
||||
GPU_FAMILY gpu_family = GPU_FAMILY::UNKNOWN;
|
||||
ADRENO_GPU_GEN adreno_gen = ADRENO_GPU_GEN::ADRENO_UNKNOWN;
|
||||
|
||||
size_t global_mem_size = 0;
|
||||
};
|
||||
|
||||
// backend context
|
||||
@@ -384,6 +389,18 @@ struct ggml_backend_opencl_context {
|
||||
cl_device_id device;
|
||||
std::string device_name;
|
||||
|
||||
ggml_cl_version platform_version;
|
||||
ggml_cl_version opencl_c_version;
|
||||
|
||||
// argsort is loaded in supports_op because its availability depends on how
|
||||
// many workgroups are allowed, which requires kernel compilation.
|
||||
bool kernels_loaded_argsort = false;
|
||||
// flash attn is loaded in supports_op because it contains multiple variants
|
||||
// and takes time to compile, so we want to only compile it when needed.
|
||||
bool kernels_loaded_flash_attn = false;
|
||||
// rest of the kernels are currently always loaded in alloc_buffer.
|
||||
bool kernels_loaded = false;
|
||||
|
||||
std::string driver_version;
|
||||
|
||||
GPU_FAMILY gpu_family;
|
||||
@@ -644,11 +661,10 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_mul_mm_iq4_nl_f32_l4_lm;
|
||||
|
||||
std::vector<ProfilingInfo> profiling_info;
|
||||
std::vector<ProfilingInfo> profiling_results;
|
||||
|
||||
void write_profiling_info() {
|
||||
FILE * fperf = fopen("cl_profiling.csv", "w");
|
||||
if (!fperf) {
|
||||
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
|
||||
void flush_profiling_batch() {
|
||||
if (profiling_info.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -672,6 +688,7 @@ struct ggml_backend_opencl_context {
|
||||
CL_CHECK(clGetEventProfilingInfo(
|
||||
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
|
||||
CL_CHECK(clReleaseEvent(info.evt));
|
||||
info.evt = nullptr;
|
||||
|
||||
char kernel_name[512];
|
||||
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
|
||||
@@ -689,10 +706,26 @@ struct ggml_backend_opencl_context {
|
||||
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
|
||||
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
|
||||
}
|
||||
profiling_results.insert(profiling_results.end(),
|
||||
std::make_move_iterator(profiling_info.begin()),
|
||||
std::make_move_iterator(profiling_info.end()));
|
||||
profiling_info.clear();
|
||||
}
|
||||
|
||||
void write_profiling_info() {
|
||||
if (profiling_results.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Dump a csv
|
||||
FILE * fperf = fopen("cl_profiling.csv", "w");
|
||||
if (!fperf) {
|
||||
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(fperf, "op name, kernel name, exec duration (ms), global size, local size, output size\n");
|
||||
for (const ProfilingInfo & info : profiling_info) {
|
||||
for (const ProfilingInfo & info : profiling_results) {
|
||||
fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
|
||||
info.op_name.c_str(), info.kernel_name.c_str(),
|
||||
info.cmd_duration_ns/1.e6f,
|
||||
@@ -703,14 +736,14 @@ struct ggml_backend_opencl_context {
|
||||
fclose(fperf);
|
||||
|
||||
// Dump a simple chrome trace
|
||||
FILE* ftrace = fopen("cl_trace.json", "w");
|
||||
FILE * ftrace = fopen("cl_trace.json", "w");
|
||||
if (!ftrace) {
|
||||
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(ftrace, "[\n");
|
||||
for (const ProfilingInfo & info : profiling_info) {
|
||||
for (const ProfilingInfo & info : profiling_results) {
|
||||
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
|
||||
info.kernel_name.c_str(), info.cmd_queued/1000);
|
||||
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
|
||||
@@ -721,6 +754,7 @@ struct ggml_backend_opencl_context {
|
||||
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
|
||||
info.kernel_name.c_str(), info.cmd_end/1000);
|
||||
}
|
||||
fprintf(ftrace, "]\n");
|
||||
fclose(ftrace);
|
||||
}
|
||||
|
||||
@@ -741,6 +775,9 @@ struct ggml_backend_opencl_context {
|
||||
|
||||
profiling_info.emplace_back();
|
||||
populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
|
||||
if (profiling_info.size() >= 2048) {
|
||||
flush_profiling_batch();
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(tensor);
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
@@ -781,11 +818,13 @@ struct ggml_backend_opencl_context {
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
void free() {
|
||||
clFinish(queue);
|
||||
|
||||
ref_count--;
|
||||
if (ref_count == 0) {
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
write_profiling_info();
|
||||
profiling_info.clear();
|
||||
profiling_results.clear();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@@ -793,6 +832,9 @@ struct ggml_backend_opencl_context {
|
||||
|
||||
// All registered devices with a default device in the front.
|
||||
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
|
||||
// All device contexts associated with the devices above.
|
||||
// The devices live as long as the process, so do the contexts.
|
||||
static std::vector<std::unique_ptr<ggml_backend_opencl_device_context>> g_ggml_backend_opencl_dev_ctxs;
|
||||
|
||||
inline std::string read_file(const std::string &path) {
|
||||
std::ifstream ifs(path);
|
||||
@@ -836,12 +878,120 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
|
||||
return p;
|
||||
}
|
||||
|
||||
static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_version opencl_c_version) {
|
||||
static void load_cl_kernels_argsort(ggml_backend_opencl_context *backend_ctx) {
|
||||
// compiler options for general kernels
|
||||
auto opencl_c_std =
|
||||
std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
|
||||
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
||||
" -cl-finite-math-only -cl-fast-relaxed-math";
|
||||
|
||||
// argsort
|
||||
if (!backend_ctx->kernels_loaded_argsort) {
|
||||
cl_int err;
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "argsort.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("argsort.cl");
|
||||
#endif
|
||||
backend_ctx->program_argsort_f32_i32 =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
|
||||
backend_ctx->kernels_loaded_argsort = true;
|
||||
}
|
||||
}
|
||||
|
||||
static void load_cl_kernels_flash_attn(ggml_backend_opencl_context *backend_ctx) {
|
||||
// compiler options for general kernels
|
||||
auto opencl_c_std =
|
||||
std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
|
||||
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
||||
" -cl-finite-math-only -cl-fast-relaxed-math";
|
||||
|
||||
// flash_attn
|
||||
if (!backend_ctx->kernels_loaded_flash_attn) {
|
||||
cl_int err;
|
||||
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src_f16 {
|
||||
#include "flash_attn_f16.cl.h"
|
||||
};
|
||||
const std::string kernel_src_f32 {
|
||||
#include "flash_attn_f32.cl.h"
|
||||
};
|
||||
const std::string kernel_src_f32_f16 {
|
||||
#include "flash_attn_f32_f16.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src_f16 = read_file("flash_attn_f16.cl");
|
||||
const std::string kernel_src_f32 = read_file("flash_attn_f32.cl");
|
||||
const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl");
|
||||
#endif
|
||||
|
||||
if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
|
||||
const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
|
||||
{ 40, 40, 32, 32}, { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32},
|
||||
{112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
|
||||
{192, 192, 16, 16}, {256, 256, 16, 16},
|
||||
};
|
||||
|
||||
for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) {
|
||||
const int dk = fa_dims[i].dk;
|
||||
const int dv = fa_dims[i].dv;
|
||||
const int bm = fa_dims[i].bm;
|
||||
const int bn = fa_dims[i].bn;
|
||||
std::string OPTS = compile_opts +
|
||||
" -D DK=" + std::to_string(dk) +
|
||||
" -D DV=" + std::to_string(dv) +
|
||||
" -D BLOCK_M=" + std::to_string(bm) +
|
||||
" -D BLOCK_N=" + std::to_string(bn);
|
||||
|
||||
cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS);
|
||||
cl_kernel k_f16, k_f16_q1;
|
||||
CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err));
|
||||
CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err));
|
||||
backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16;
|
||||
backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1;
|
||||
CL_CHECK(clReleaseProgram(prog_f16));
|
||||
|
||||
cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS);
|
||||
cl_kernel k_f32, k_f32_q1;
|
||||
CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err));
|
||||
CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err));
|
||||
backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32;
|
||||
backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1;
|
||||
CL_CHECK(clReleaseProgram(prog_f32));
|
||||
|
||||
cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS);
|
||||
cl_kernel k_f32_f16, k_f32_f16_q1;
|
||||
CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err));
|
||||
CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err));
|
||||
backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16;
|
||||
backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1;
|
||||
CL_CHECK(clReleaseProgram(prog_f32_f16));
|
||||
|
||||
backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm;
|
||||
backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn;
|
||||
}
|
||||
backend_ctx->kernels_loaded_flash_attn = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
if (backend_ctx->kernels_loaded) {
|
||||
return;
|
||||
}
|
||||
|
||||
cl_int err;
|
||||
|
||||
// compiler options for general kernels
|
||||
auto opencl_c_std =
|
||||
std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
|
||||
std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor);
|
||||
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
||||
" -cl-finite-math-only -cl-fast-relaxed-math";
|
||||
@@ -1986,89 +2136,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// flash_attn
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src_f16 {
|
||||
#include "flash_attn_f16.cl.h"
|
||||
};
|
||||
const std::string kernel_src_f32 {
|
||||
#include "flash_attn_f32.cl.h"
|
||||
};
|
||||
const std::string kernel_src_f32_f16 {
|
||||
#include "flash_attn_f32_f16.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src_f16 = read_file("flash_attn_f16.cl");
|
||||
const std::string kernel_src_f32 = read_file("flash_attn_f32.cl");
|
||||
const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl");
|
||||
#endif
|
||||
|
||||
if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
|
||||
const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
|
||||
{ 40, 40, 32, 32}, { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32},
|
||||
{112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
|
||||
{192, 192, 16, 16}, {256, 256, 16, 16},
|
||||
};
|
||||
|
||||
for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) {
|
||||
const int dk = fa_dims[i].dk;
|
||||
const int dv = fa_dims[i].dv;
|
||||
const int bm = fa_dims[i].bm;
|
||||
const int bn = fa_dims[i].bn;
|
||||
std::string OPTS = compile_opts +
|
||||
" -D DK=" + std::to_string(dk) +
|
||||
" -D DV=" + std::to_string(dv) +
|
||||
" -D BLOCK_M=" + std::to_string(bm) +
|
||||
" -D BLOCK_N=" + std::to_string(bn);
|
||||
|
||||
cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS);
|
||||
cl_kernel k_f16, k_f16_q1;
|
||||
CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err));
|
||||
CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err));
|
||||
backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16;
|
||||
backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1;
|
||||
CL_CHECK(clReleaseProgram(prog_f16));
|
||||
|
||||
cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS);
|
||||
cl_kernel k_f32, k_f32_q1;
|
||||
CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err));
|
||||
CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err));
|
||||
backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32;
|
||||
backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1;
|
||||
CL_CHECK(clReleaseProgram(prog_f32));
|
||||
|
||||
cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS);
|
||||
cl_kernel k_f32_f16, k_f32_f16_q1;
|
||||
CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err));
|
||||
CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err));
|
||||
backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16;
|
||||
backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1;
|
||||
CL_CHECK(clReleaseProgram(prog_f32_f16));
|
||||
|
||||
backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm;
|
||||
backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn;
|
||||
}
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
}
|
||||
|
||||
// argsort
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "argsort.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("argsort.cl");
|
||||
#endif
|
||||
backend_ctx->program_argsort_f32_i32 =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// div
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -3335,13 +3402,15 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_LOG_CONT("\n");
|
||||
backend_ctx->kernels_loaded = true;
|
||||
}
|
||||
|
||||
// XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
// XXX static bool initialized = false;
|
||||
// XXX static ggml_backend_opencl_context *backend_ctx = nullptr;
|
||||
|
||||
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
|
||||
static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev);
|
||||
static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev);
|
||||
|
||||
namespace /* anonymous */ {
|
||||
extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
|
||||
@@ -3554,13 +3623,13 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
|
||||
/* .context = */ dev_ctx.get(),
|
||||
});
|
||||
|
||||
if (!ggml_cl2_init(&found_devices.back())) {
|
||||
if (!ggml_opencl_is_device_supported(&found_devices.back())) {
|
||||
found_devices.pop_back();
|
||||
GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n");
|
||||
GGML_LOG_WARN("ggml_opencl: drop unsupported device '%s'.\n", dev->name);
|
||||
continue;
|
||||
}
|
||||
|
||||
dev_ctx.release();
|
||||
g_ggml_backend_opencl_dev_ctxs.push_back(std::move(dev_ctx));
|
||||
}
|
||||
|
||||
if (found_devices.size()) {
|
||||
@@ -3577,8 +3646,79 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
|
||||
return found_devices;
|
||||
}
|
||||
|
||||
// check if device should be accepted
|
||||
static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev) {
|
||||
GGML_ASSERT(dev);
|
||||
GGML_ASSERT(dev->context);
|
||||
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
|
||||
GGML_ASSERT(dev_ctx->platform);
|
||||
GGML_ASSERT(dev_ctx->device);
|
||||
|
||||
if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
|
||||
strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
|
||||
strstr(dev_ctx->device_version.c_str(), "Adreno")) {
|
||||
dev_ctx->gpu_family = GPU_FAMILY::ADRENO;
|
||||
|
||||
// Usually device version contains the detailed device name
|
||||
dev_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
|
||||
if (dev_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
|
||||
dev_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
|
||||
}
|
||||
} else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
|
||||
dev_ctx->gpu_family = GPU_FAMILY::INTEL;
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: unsupported GPU '%s'.\n", dev_ctx->device_name.c_str());
|
||||
dev_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
|
||||
|
||||
// Check device OpenCL version, OpenCL 2.0 or above is required
|
||||
ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, dev_ctx->device);
|
||||
if (opencl_c_version.major < 2) {
|
||||
GGML_LOG_WARN("ggml_opencl: OpenCL 2.0 or above is required\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (dev_ctx->gpu_family != GPU_FAMILY::ADRENO) {
|
||||
GGML_LOG_WARN("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
|
||||
"run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
size_t ext_str_size;
|
||||
clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
|
||||
|
||||
char *ext_buffer = (char *)alloca(ext_str_size + 1);
|
||||
clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
|
||||
ext_buffer[ext_str_size] = '\0';
|
||||
|
||||
// Check if ext_buffer contains cl_khr_fp16
|
||||
bool fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
||||
if (!fp16_support) {
|
||||
GGML_LOG_WARN("ggml_opencl: device does not support FP16\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
|
||||
// optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
|
||||
if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
|
||||
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
|
||||
GGML_LOG_WARN("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
|
||||
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
clGetDeviceInfo(dev_ctx->device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &dev_ctx->global_mem_size, NULL);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Initialize device if it is supported (returns nullptr if it is not).
|
||||
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
|
||||
GGML_ASSERT(dev);
|
||||
GGML_ASSERT(dev->context);
|
||||
|
||||
@@ -3600,34 +3740,13 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
// when the associated device is initialized
|
||||
backend_ctx->ref_count = 0;
|
||||
|
||||
if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
|
||||
strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
|
||||
strstr(dev_ctx->device_version.c_str(), "Adreno")) {
|
||||
backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
|
||||
// Usually device version contains the detailed device name
|
||||
backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
|
||||
if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
|
||||
backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
|
||||
}
|
||||
|
||||
backend_ctx->gpu_family = dev_ctx->gpu_family;
|
||||
backend_ctx->adreno_gen = dev_ctx->adreno_gen;
|
||||
if (backend_ctx->gpu_family == GPU_FAMILY::ADRENO) {
|
||||
// Use wave size of 64 for all Adreno GPUs.
|
||||
backend_ctx->adreno_wave_size = 64;
|
||||
} else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
|
||||
backend_ctx->gpu_family = GPU_FAMILY::INTEL;
|
||||
} else {
|
||||
GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
|
||||
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
|
||||
GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
|
||||
"run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
|
||||
return nullptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Populate backend device name
|
||||
backend_ctx->device_name = dev_ctx->device_name;
|
||||
|
||||
@@ -3635,13 +3754,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
cl_device_id device = backend_ctx->device;
|
||||
|
||||
ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
|
||||
|
||||
// Check device OpenCL version, OpenCL 2.0 or above is required
|
||||
ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
|
||||
if (opencl_c_version.major < 2) {
|
||||
GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
backend_ctx->platform_version = platform_version;
|
||||
backend_ctx->opencl_c_version = opencl_c_version;
|
||||
|
||||
// Check driver version
|
||||
size_t driver_version_str_size;
|
||||
@@ -3664,34 +3780,21 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
char *ext_buffer = (char *)alloca(ext_str_size + 1);
|
||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
|
||||
ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
|
||||
|
||||
// Check if ext_buffer contains cl_khr_fp16
|
||||
backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
||||
GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
|
||||
|
||||
// check Adreno large buffer support
|
||||
backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL;
|
||||
|
||||
// fp16 is required
|
||||
if (!backend_ctx->fp16_support) {
|
||||
GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
|
||||
// optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
|
||||
if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
|
||||
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
|
||||
GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
|
||||
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
cl_uint base_align_in_bits;
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
|
||||
GGML_ASSERT(base_align_in_bits % 8u == 0);
|
||||
backend_ctx->alignment = base_align_in_bits / 8u;
|
||||
GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
|
||||
|
||||
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &backend_ctx->global_mem_size, NULL);
|
||||
backend_ctx->global_mem_size = dev_ctx->global_mem_size;
|
||||
GGML_LOG_INFO("ggml_opencl: global mem size: %zu MB\n", backend_ctx->global_mem_size/1024/1024);
|
||||
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
|
||||
@@ -3779,8 +3882,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
#endif
|
||||
CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
|
||||
|
||||
// Load kernels
|
||||
load_cl_kernels(backend_ctx.get(), opencl_c_version);
|
||||
// delay kernel loading until the first buffer is created
|
||||
// load_cl_kernels(backend_ctx.get());
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Allocate intermediate buffers and images
|
||||
@@ -3822,22 +3925,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
return dev_ctx->backend_ctx;
|
||||
}
|
||||
|
||||
static void ggml_cl2_free(ggml_backend_t backend) {
|
||||
static void ggml_cl_free(ggml_backend_t backend) {
|
||||
ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
|
||||
ctx->free();
|
||||
|
||||
// The CL context is shared by all backends, release it if all backends have been released
|
||||
bool should_release_opencl = true;
|
||||
for (auto device : g_ggml_backend_opencl_devices) {
|
||||
ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
|
||||
if (ctx_dev->backend_ctx->ref_count > 0) {
|
||||
should_release_opencl = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (should_release_opencl) {
|
||||
CL_CHECK(clReleaseContext(ctx->context));
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
@@ -4421,7 +4511,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_free(ggml_backend_t backend) {
|
||||
ggml_cl2_free(backend);
|
||||
ggml_cl_free(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
@@ -4460,14 +4550,17 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
|
||||
// enqueued to it won't start until commands in the other devices have
|
||||
// completed.
|
||||
static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
|
||||
if (g_ggml_backend_opencl_devices.size() < 2)
|
||||
return; // No other devices to synchronize with.
|
||||
if (g_ggml_backend_opencl_devices.size() < 2) {
|
||||
return; // No other devices to synchronize with.
|
||||
}
|
||||
|
||||
std::vector<cl_event> events;
|
||||
events.reserve(g_ggml_backend_opencl_devices.size());
|
||||
|
||||
for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) {
|
||||
auto * other_backend_ctx = ggml_cl2_init(&backend_dev);
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) backend_dev.context;
|
||||
auto * other_backend_ctx = dev_ctx->backend_ctx;
|
||||
|
||||
if (backend_ctx != other_backend_ctx) {
|
||||
cl_event ev;
|
||||
CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
|
||||
@@ -4620,7 +4713,7 @@ inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, c
|
||||
inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
|
||||
GGML_UNUSED(backend_ctx);
|
||||
int ne01 = tensor->ne[1];
|
||||
return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
|
||||
return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 32 == 0);
|
||||
}
|
||||
|
||||
inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
|
||||
@@ -4880,6 +4973,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_OP_IM2COL:
|
||||
return true;
|
||||
case GGML_OP_ARGSORT: {
|
||||
load_cl_kernels_argsort(backend_ctx);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
|
||||
int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
|
||||
|
||||
@@ -4897,6 +4992,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
load_cl_kernels_flash_attn(backend_ctx);
|
||||
|
||||
const ggml_tensor * q = op->src[0];
|
||||
const ggml_tensor * k = op->src[1];
|
||||
const ggml_tensor * v = op->src[2];
|
||||
@@ -4964,7 +5061,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
|
||||
|
||||
ggml_backend_t ggml_backend_opencl_init(void) {
|
||||
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0);
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl_init(dev);
|
||||
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_opencl_guid(),
|
||||
@@ -5343,15 +5440,13 @@ static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||
}
|
||||
|
||||
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
|
||||
return (void *) (uintptr_t) backend_ctx->alignment;
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
|
||||
return (void *) (uintptr_t) dev_ctx->backend_ctx->alignment;
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
|
||||
ggml_cl2_init(buffer->buft->device);
|
||||
|
||||
if (tensor->view_src != nullptr) {
|
||||
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
||||
|
||||
@@ -5391,7 +5486,8 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
|
||||
ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
|
||||
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
@@ -6626,7 +6722,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(tensor->extra);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
|
||||
ggml_backend_opencl_context *backend_ctx = dev_ctx->backend_ctx;
|
||||
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
@@ -7470,8 +7567,9 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
ggml_backend_dev_t dev = buffer->buft->device;
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context;
|
||||
ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
|
||||
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
@@ -7511,7 +7609,8 @@ static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device);
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl_init(buffer_type->device);
|
||||
load_cl_kernels(backend_ctx);
|
||||
|
||||
// clCreateBuffer returns -61 for size 0
|
||||
size = std::max(size, (size_t)1);
|
||||
@@ -7534,15 +7633,15 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
|
||||
}
|
||||
|
||||
static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
|
||||
return backend_ctx->alignment;
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer_type->device->context;
|
||||
return dev_ctx->backend_ctx->alignment;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
|
||||
static size_t max_size = -1;
|
||||
if (max_size == (size_t)-1) {
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
|
||||
max_size = backend_ctx->max_alloc_size;
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer_type->device->context;
|
||||
max_size = dev_ctx->backend_ctx->max_alloc_size;
|
||||
}
|
||||
return max_size;
|
||||
}
|
||||
@@ -7579,14 +7678,13 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
|
||||
|
||||
static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
|
||||
ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) dev_ctx->backend_ctx;
|
||||
|
||||
static const size_t opencl_extra_margin = 1024ull*1024ull*1024ull;
|
||||
|
||||
// OpenCL does not provide reliable currently-free device memory.
|
||||
// Use total/global memory as a best-effort upper bound.
|
||||
// Improved safety: Reduce by a 1GiB extra margin for common --fit
|
||||
*total = backend_ctx->global_mem_size;
|
||||
*total = dev_ctx->global_mem_size;
|
||||
*free = *total > opencl_extra_margin ? *total - opencl_extra_margin : 0;
|
||||
}
|
||||
|
||||
@@ -7610,7 +7708,7 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl_init(dev);
|
||||
// Getting a new reference to the backend, increase ref_count
|
||||
backend_ctx->ref_count++;
|
||||
|
||||
@@ -7647,6 +7745,7 @@ static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_bac
|
||||
}
|
||||
|
||||
static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
ggml_cl_init(dev);
|
||||
return ggml_opencl_supports_op(dev, op);
|
||||
}
|
||||
|
||||
@@ -7659,8 +7758,8 @@ static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggm
|
||||
|
||||
// Check cl_context is the same. clEnqueue* commands may not use
|
||||
// buffers from another cl_context.
|
||||
ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev);
|
||||
ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device);
|
||||
ggml_backend_opencl_context * backend_ctx0 = ggml_cl_init(dev);
|
||||
ggml_backend_opencl_context * backend_ctx1 = ggml_cl_init(buft->device);
|
||||
return backend_ctx0->context == backend_ctx1->context;
|
||||
}
|
||||
|
||||
@@ -14218,7 +14317,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
CL_CHECK(status);
|
||||
|
||||
// set thread grid
|
||||
global_size[0] = static_cast<size_t>(ne01);
|
||||
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
|
||||
global_size[1] = 4;
|
||||
global_size[2] = static_cast<size_t>(ne20);
|
||||
local_size[1] = 4;
|
||||
@@ -14434,7 +14533,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
CL_CHECK(status);
|
||||
|
||||
// set thread grid
|
||||
global_size[0] = static_cast<size_t>(ne01);
|
||||
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
|
||||
global_size[1] = 4;
|
||||
global_size[2] = static_cast<size_t>(ne20);
|
||||
local_size[1] = 4;
|
||||
@@ -14610,7 +14709,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
CL_CHECK(status);
|
||||
|
||||
// set thread grid
|
||||
global_size[0] = static_cast<size_t>(ne01);
|
||||
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
|
||||
global_size[1] = 4;
|
||||
global_size[2] = static_cast<size_t>(ne20);
|
||||
local_size[1] = 4;
|
||||
@@ -14786,7 +14885,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
CL_CHECK(status);
|
||||
|
||||
// set thread grid
|
||||
global_size[0] = static_cast<size_t>(ne01);
|
||||
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
|
||||
global_size[1] = 4;
|
||||
global_size[2] = static_cast<size_t>(ne20);
|
||||
local_size[1] = 4;
|
||||
@@ -15039,7 +15138,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
CL_CHECK(status);
|
||||
|
||||
// set thread grid
|
||||
global_size[0] = static_cast<size_t>(ne01);
|
||||
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
|
||||
global_size[1] = 4;
|
||||
global_size[2] = static_cast<size_t>(ne20);
|
||||
local_size[1] = 4;
|
||||
@@ -15212,7 +15311,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
CL_CHECK(status);
|
||||
|
||||
// set thread grid
|
||||
global_size[0] = static_cast<size_t>(ne01);
|
||||
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
|
||||
global_size[1] = 4;
|
||||
global_size[2] = static_cast<size_t>(ne20);
|
||||
local_size[1] = 4;
|
||||
@@ -15390,7 +15489,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
CL_CHECK(status);
|
||||
|
||||
// set thread grid
|
||||
global_size[0] = static_cast<size_t>(ne01);
|
||||
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
|
||||
global_size[1] = 4;
|
||||
global_size[2] = static_cast<size_t>(ne20);
|
||||
local_size[1] = 4;
|
||||
@@ -15565,7 +15664,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
CL_CHECK(status);
|
||||
|
||||
// set thread grid
|
||||
global_size[0] = static_cast<size_t>(ne01);
|
||||
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
|
||||
global_size[1] = 4;
|
||||
global_size[2] = static_cast<size_t>(ne20);
|
||||
local_size[1] = 4;
|
||||
|
||||
@@ -220,6 +220,10 @@ kernel void kernel_convert_block_q4_0_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK4_0;
|
||||
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -263,6 +267,10 @@ kernel void kernel_restore_block_q4_0_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK4_0;
|
||||
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -401,6 +409,10 @@ kernel void kernel_convert_block_q4_1_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK4_1;
|
||||
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -446,6 +458,10 @@ kernel void kernel_restore_block_q4_1_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK4_1;
|
||||
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint src_dm_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -491,6 +507,10 @@ kernel void kernel_convert_block_q5_0_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK5_0;
|
||||
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -536,6 +556,10 @@ kernel void kernel_restore_block_q5_0_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK5_0;
|
||||
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -583,6 +607,10 @@ kernel void kernel_convert_block_q5_1_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK5_1;
|
||||
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -630,6 +658,10 @@ kernel void kernel_restore_block_q5_1_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK5_1;
|
||||
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -679,6 +711,10 @@ kernel void kernel_convert_block_q4_k_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK_K;
|
||||
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -732,6 +768,10 @@ kernel void kernel_restore_block_q4_k_trans4_ns(
|
||||
uint i01 = get_global_id(0); // row index
|
||||
uint i02 = get_global_id(2); // batch index
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK_K;
|
||||
|
||||
uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -784,6 +824,10 @@ kernel void kernel_convert_block_q5_k_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK_K;
|
||||
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -850,6 +894,10 @@ kernel void kernel_restore_block_q5_k_trans4_ns(
|
||||
uint i01 = get_global_id(0); // row index
|
||||
uint i02 = get_global_id(2); // batch index
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK_K;
|
||||
|
||||
uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -916,6 +964,10 @@ kernel void kernel_convert_block_q6_k_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK_K;
|
||||
|
||||
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
@@ -993,6 +1045,10 @@ kernel void kernel_restore_block_q6_k_trans4_ns(
|
||||
uint i01 = get_global_id(0); // row index
|
||||
uint i02 = get_global_id(2); // batch index
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK_K;
|
||||
|
||||
uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -1147,6 +1203,10 @@ kernel void kernel_convert_block_mxfp4_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK_MXFP4;
|
||||
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
@@ -1190,6 +1250,10 @@ kernel void kernel_restore_block_mxfp4_trans4_ns(
|
||||
uint i01 = get_global_id(0);
|
||||
uint i02 = get_global_id(2);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint ne00_blk = ne00 / QK_MXFP4;
|
||||
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
||||
uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
||||
|
||||
@@ -163,7 +163,7 @@ kernel void kernel_gemm_moe_mxfp4_f32_ns(
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
if (block_id_n >= total_tiles[0]) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -248,6 +248,10 @@ kernel void kernel_gemm_moe_mxfp4_f32_ns(
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load poster router and share in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
|
||||
@@ -115,7 +115,7 @@ kernel void kernel_gemm_moe_q4_0_f32_ns(
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
if (block_id_n >= total_tiles[0]) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -198,6 +198,10 @@ kernel void kernel_gemm_moe_q4_0_f32_ns(
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load poster router and share in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
|
||||
@@ -116,7 +116,7 @@ kernel void kernel_gemm_moe_q4_1_f32_ns(
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
if (block_id_n >= total_tiles[0]) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -200,6 +200,10 @@ kernel void kernel_gemm_moe_q4_1_f32_ns(
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load poster router and share in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
|
||||
@@ -133,7 +133,7 @@ kernel void kernel_gemm_moe_q4_k_f32_ns(
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
if (block_id_n >= total_tiles[0]) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -225,6 +225,10 @@ kernel void kernel_gemm_moe_q4_k_f32_ns(
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load post router and share in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
|
||||
@@ -116,7 +116,7 @@ kernel void kernel_gemm_moe_q5_0_f32_ns(
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
if (block_id_n >= total_tiles[0]) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -202,6 +202,10 @@ kernel void kernel_gemm_moe_q5_0_f32_ns(
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load poster router and share in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
|
||||
@@ -117,7 +117,7 @@ kernel void kernel_gemm_moe_q5_1_f32_ns(
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
if (block_id_n >= total_tiles[0]) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -204,6 +204,10 @@ kernel void kernel_gemm_moe_q5_1_f32_ns(
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load poster router and share in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
|
||||
@@ -134,7 +134,7 @@ kernel void kernel_gemm_moe_q5_k_f32_ns(
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
if (block_id_n >= total_tiles[0]) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -230,6 +230,10 @@ kernel void kernel_gemm_moe_q5_k_f32_ns(
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load post router and share in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
|
||||
@@ -117,7 +117,7 @@ kernel void kernel_gemm_moe_q6_k_f32_ns(
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
if (block_id_n >= total_tiles[0]) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -209,6 +209,10 @@ kernel void kernel_gemm_moe_q6_k_f32_ns(
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load post router and share in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
|
||||
@@ -82,6 +82,10 @@ __kernel void kernel_gemv_moe_mxfp4_f32_ns(
|
||||
uint sgid = get_local_id(1);
|
||||
uint slid = get_sub_group_local_id();
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint i11 = i20 % ne11;
|
||||
|
||||
uint expert_id = src2[i20];
|
||||
|
||||
@@ -37,6 +37,10 @@ __kernel void kernel_gemv_moe_q4_0_f32_ns(
|
||||
uint sgid = get_local_id(1);
|
||||
uint slid = get_sub_group_local_id();
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint i11 = i20 % ne11;
|
||||
|
||||
uint expert_id = src2[i20];
|
||||
|
||||
@@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q4_1_f32_ns(
|
||||
uint sgid = get_local_id(1);
|
||||
uint slid = get_sub_group_local_id();
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint i11 = i20 % ne11;
|
||||
|
||||
uint expert_id = src2[i20];
|
||||
|
||||
@@ -54,6 +54,10 @@ __kernel void kernel_gemv_moe_q4_k_f32_ns(
|
||||
uint sgid = get_local_id(1);
|
||||
uint slid = get_sub_group_local_id();
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint i11 = i20 % ne11;
|
||||
|
||||
uint expert_id = src2[i20];
|
||||
|
||||
@@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q5_0_f32_ns(
|
||||
uint sgid = get_local_id(1);
|
||||
uint slid = get_sub_group_local_id();
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint i11 = i20 % ne11;
|
||||
|
||||
uint expert_id = src2[i20];
|
||||
|
||||
@@ -39,6 +39,10 @@ __kernel void kernel_gemv_moe_q5_1_f32_ns(
|
||||
uint sgid = get_local_id(1);
|
||||
uint slid = get_sub_group_local_id();
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint i11 = i20 % ne11;
|
||||
|
||||
uint expert_id = src2[i20];
|
||||
|
||||
@@ -55,6 +55,10 @@ __kernel void kernel_gemv_moe_q5_k_f32_ns(
|
||||
uint sgid = get_local_id(1);
|
||||
uint slid = get_sub_group_local_id();
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint i11 = i20 % ne11;
|
||||
|
||||
uint expert_id = src2[i20];
|
||||
|
||||
@@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q6_k_f32_ns(
|
||||
uint sgid = get_local_id(1);
|
||||
uint slid = get_sub_group_local_id();
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint i11 = i20 % ne11;
|
||||
|
||||
uint expert_id = src2[i20];
|
||||
|
||||
@@ -238,6 +238,8 @@ struct ggml_sycl_device_info {
|
||||
std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
|
||||
|
||||
int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0};
|
||||
|
||||
bool ext_oneapi_level_zero = true; // sycl::backend::ext_oneapi_level_zero used by all enumerated GPU devices
|
||||
};
|
||||
|
||||
const ggml_sycl_device_info & ggml_sycl_info();
|
||||
|
||||
@@ -3,6 +3,13 @@
|
||||
#include "dequantize.hpp"
|
||||
#include "presets.hpp"
|
||||
|
||||
#if defined(__INTEL_LLVM_COMPILER)
|
||||
#if __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
|
||||
#include <sycl/ext/oneapi/bfloat16.hpp>
|
||||
#define GGML_SYCL_DMMV_HAS_BF16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
||||
const sycl::half *x = (const sycl::half *)vx;
|
||||
|
||||
@@ -11,6 +18,16 @@ static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat
|
||||
v.y() = x[ib + iqs + 1];
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_DMMV_HAS_BF16
|
||||
static void convert_bf16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
||||
const sycl::ext::oneapi::bfloat16 *x = (const sycl::ext::oneapi::bfloat16 *)vx;
|
||||
|
||||
// automatic bfloat16 -> float type cast if dfloat == float
|
||||
v.x() = x[ib + iqs + 0];
|
||||
v.y() = x[ib + iqs + 1];
|
||||
}
|
||||
#endif
|
||||
|
||||
static void convert_f32(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
||||
const float * x = (const float *) vx;
|
||||
|
||||
@@ -217,6 +234,28 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_DMMV_HAS_BF16
|
||||
static void convert_mul_mat_vec_bf16_sycl(const void *vx, const dfloat *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
// The qk=1 kernel iterates with stride 2*GGML_SYCL_DMMV_X, so ncols must be a
|
||||
// multiple of that — not just GGML_SYCL_DMMV_X — to avoid out-of-bounds reads.
|
||||
GGML_ASSERT(ncols % (2*GGML_SYCL_DMMV_X) == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<1, 1, convert_bf16>(vx, y, dst, ncols,
|
||||
nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
DPCT1110:4: The total declared local variable size in device function
|
||||
dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
|
||||
@@ -1497,7 +1536,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
bool src1_convert_f16 =
|
||||
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
||||
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
||||
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
||||
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16 ||
|
||||
src0->type == GGML_TYPE_BF16;
|
||||
|
||||
if (src1_convert_f16) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
|
||||
@@ -1565,6 +1605,11 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
case GGML_TYPE_F16:
|
||||
convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
#ifdef GGML_SYCL_DMMV_HAS_BF16
|
||||
case GGML_TYPE_BF16:
|
||||
convert_mul_mat_vec_bf16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
|
||||
GGML_ABORT("fatal error");
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
#include <cmath>
|
||||
|
||||
|
||||
template <int S_v, bool KDA>
|
||||
template <int S_v, bool KDA, bool keep_rs_t>
|
||||
void gated_delta_net_sycl(const float * q,
|
||||
const float * k,
|
||||
const float * v,
|
||||
@@ -28,7 +28,8 @@ void gated_delta_net_sycl(const float * q,
|
||||
int64_t sb3,
|
||||
const sycl::uint3 neqk1_magic,
|
||||
const sycl::uint3 rq3_magic,
|
||||
float scale) {
|
||||
float scale,
|
||||
int K) {
|
||||
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
|
||||
const uint32_t h_idx = item_ct1.get_group(2);
|
||||
const uint32_t sequence = item_ct1.get_group(1);
|
||||
@@ -43,9 +44,13 @@ void gated_delta_net_sycl(const float * q,
|
||||
float * attn_data = dst;
|
||||
float * state = dst + attn_score_elems;
|
||||
|
||||
const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v;
|
||||
state += state_offset;
|
||||
curr_state += state_offset;
|
||||
// input state layout (D, K, n_seqs) — seq stride is K * D = K * H * S_v * S_v.
|
||||
// output state layout (per-slot D * n_seqs) — same per-(seq,head) offset as before.
|
||||
const int64_t state_in_offset = sequence * K * H * S_v * S_v + h_idx * S_v * S_v;
|
||||
const int64_t state_out_offset = (sequence * H + h_idx) * S_v * S_v;
|
||||
const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output
|
||||
state += state_out_offset;
|
||||
curr_state += state_in_offset + col * S_v;
|
||||
attn_data += (sequence * n_tokens * H + h_idx) * S_v;
|
||||
|
||||
constexpr int warp_size = ggml_sycl_get_physical_warp_size() < S_v ? ggml_sycl_get_physical_warp_size() : S_v;
|
||||
@@ -55,9 +60,13 @@ void gated_delta_net_sycl(const float * q,
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
s_shard[r] = curr_state[col * S_v + i];
|
||||
s_shard[r] = curr_state[i];
|
||||
}
|
||||
|
||||
// slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots
|
||||
// are written; earlier slots are left untouched (caller-owned).
|
||||
const int shift = (int) n_tokens - K;
|
||||
|
||||
for (int t = 0; t < n_tokens; t++) {
|
||||
const float * q_t = q + iq3 * sq3 + t * sq2 + iq1 * sq1;
|
||||
const float * k_t = k + iq3 * sq3 + t * sq2 + iq1 * sq1;
|
||||
@@ -131,17 +140,32 @@ void gated_delta_net_sycl(const float * q,
|
||||
}
|
||||
|
||||
attn_data += S_v * H;
|
||||
}
|
||||
|
||||
|
||||
// Write state back to global memory
|
||||
if constexpr (keep_rs_t) {
|
||||
const int target_slot = t - shift;
|
||||
if (target_slot >= 0 && target_slot < K) {
|
||||
float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset;
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
state[col * S_v + i] = s_shard[r];
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
curr_state[col * S_v + i] = s_shard[r];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (!keep_rs_t) {
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
state[col * S_v + i] = s_shard[r];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <bool KDA>
|
||||
template <bool KDA, bool keep_rs_t>
|
||||
static void launch_gated_delta_net(const float * q_d,
|
||||
const float * k_d,
|
||||
const float * v_d,
|
||||
@@ -165,6 +189,7 @@ static void launch_gated_delta_net(const float * q_d,
|
||||
int64_t neqk1,
|
||||
int64_t rq3,
|
||||
float scale,
|
||||
int K,
|
||||
dpct::queue_ptr stream) {
|
||||
//TODO: Add chunked kernel for even faster pre-fill
|
||||
const int warp_size = ggml_sycl_info().devices[ggml_sycl_get_device()].warp_size;
|
||||
@@ -182,9 +207,9 @@ static void launch_gated_delta_net(const float * q_d,
|
||||
constexpr int sv = 16;
|
||||
stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
gated_delta_net_sycl<sv, KDA>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
|
||||
gated_delta_net_sycl<sv, KDA, keep_rs_t>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
|
||||
n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2,
|
||||
sb3, neqk1_magic, rq3_magic, scale);
|
||||
sb3, neqk1_magic, rq3_magic, scale, K);
|
||||
});
|
||||
}
|
||||
break;
|
||||
@@ -193,9 +218,9 @@ static void launch_gated_delta_net(const float * q_d,
|
||||
constexpr int sv = 32;
|
||||
stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
gated_delta_net_sycl<sv, KDA>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
|
||||
gated_delta_net_sycl<sv, KDA, keep_rs_t>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
|
||||
n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2,
|
||||
sb3, neqk1_magic, rq3_magic, scale);
|
||||
sb3, neqk1_magic, rq3_magic, scale, K);
|
||||
});
|
||||
}
|
||||
break;
|
||||
@@ -204,9 +229,9 @@ static void launch_gated_delta_net(const float * q_d,
|
||||
constexpr int sv = 64;
|
||||
stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
gated_delta_net_sycl<sv, KDA>(
|
||||
gated_delta_net_sycl<sv, KDA, keep_rs_t>(
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2,
|
||||
sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
|
||||
sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
|
||||
});
|
||||
}
|
||||
break;
|
||||
@@ -216,9 +241,9 @@ static void launch_gated_delta_net(const float * q_d,
|
||||
constexpr int sv = 128;
|
||||
stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
gated_delta_net_sycl<sv, KDA>(
|
||||
gated_delta_net_sycl<sv, KDA, keep_rs_t>(
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2,
|
||||
sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
|
||||
sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
|
||||
});
|
||||
}
|
||||
break;
|
||||
@@ -290,14 +315,30 @@ void ggml_sycl_op_gated_delta_net(ggml_backend_sycl_context & ctx, ggml_tensor *
|
||||
|
||||
dpct::queue_ptr stream = ctx.stream();
|
||||
|
||||
// state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
|
||||
const int K = (int) src_state->ne[1];
|
||||
const bool keep_rs = K > 1;
|
||||
|
||||
if (kda) {
|
||||
launch_gated_delta_net<true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, stream);
|
||||
if (keep_rs) {
|
||||
launch_gated_delta_net<true, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
|
||||
} else {
|
||||
launch_gated_delta_net<true, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
|
||||
}
|
||||
} else {
|
||||
launch_gated_delta_net<false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, stream);
|
||||
if (keep_rs) {
|
||||
launch_gated_delta_net<false, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
|
||||
} else {
|
||||
launch_gated_delta_net<false, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+121
-110
@@ -98,7 +98,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
||||
for (int i = 0; i < info.device_count; ++i) {
|
||||
info.devices[i].vmm = 0;
|
||||
dpct::device_info prop;
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(i);
|
||||
auto & device = dpct::dev_mgr::instance().get_device(i);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, device)));
|
||||
@@ -117,6 +117,12 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
||||
info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
|
||||
info.devices[i].hw_info = get_device_hw_info(&device);
|
||||
|
||||
// Only check GPU devices; CPU devices use OpenCL and would otherwise
|
||||
// disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set.
|
||||
if (device.is_gpu() && device.default_queue().get_backend() != sycl::backend::ext_oneapi_level_zero) {
|
||||
GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
|
||||
info.ext_oneapi_level_zero = false;
|
||||
}
|
||||
}
|
||||
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
@@ -230,26 +236,10 @@ static void ggml_check_sycl() try {
|
||||
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
|
||||
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
|
||||
g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
|
||||
#else
|
||||
g_ggml_sycl_enable_level_zero = 0;
|
||||
#endif
|
||||
if (g_ggml_sycl_enable_level_zero) {
|
||||
// Verify all GPU devices use the Level Zero backend before enabling L0 APIs.
|
||||
// Only check GPU devices; CPU devices use OpenCL and would otherwise
|
||||
// disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set.
|
||||
for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); i++) {
|
||||
auto & q = dpct::dev_mgr::instance().get_device(i).default_queue();
|
||||
if (!q.get_device().is_gpu()) {
|
||||
continue;
|
||||
}
|
||||
if (q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
|
||||
GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
|
||||
g_ggml_sycl_enable_level_zero = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SYCL_FLASH_ATTN
|
||||
g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
|
||||
@@ -3455,6 +3445,7 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_BF16:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -3818,8 +3809,13 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
|
||||
|
||||
|
||||
static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
// The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be
|
||||
// a multiple of 2*DMMV_X. Quantized types use block-structured kernels that only
|
||||
// need ne[0] % DMMV_X == 0.
|
||||
const int64_t dmmv_x_required = (src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F16) ?
|
||||
2*GGML_SYCL_DMMV_X : GGML_SYCL_DMMV_X;
|
||||
return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
|
||||
src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
|
||||
src0->ne[0] % dmmv_x_required == 0 && src1->ne[1] == 1;
|
||||
}
|
||||
|
||||
static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
@@ -3923,35 +3919,17 @@ struct mmid_row_mapping {
|
||||
|
||||
__dpct_inline__ static void k_copy_src1_to_contiguous(
|
||||
const char *__restrict__ src1_original, char *__restrict__ src1_contiguous,
|
||||
int *__restrict__ cur_src1_row, mmid_row_mapping *__restrict__ row_mapping,
|
||||
const char *__restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
|
||||
const mmid_row_mapping *__restrict__ row_mapping,
|
||||
int64_t ne11, int64_t ne10, size_t nb11, size_t nb12,
|
||||
const sycl::nd_item<3> &item_ct1, int &src1_row) {
|
||||
int32_t iid1 = item_ct1.get_group(2);
|
||||
int32_t id = item_ct1.get_group(1);
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int32_t src1_row = item_ct1.get_group(2);
|
||||
|
||||
const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
|
||||
|
||||
if (row_id_i != i02) {
|
||||
return;
|
||||
}
|
||||
const int32_t iid1 = row_mapping[src1_row].i2;
|
||||
const int32_t id = row_mapping[src1_row].i1;
|
||||
|
||||
const int64_t i11 = id % ne11;
|
||||
const int64_t i12 = iid1;
|
||||
|
||||
if (item_ct1.get_local_id(2) == 0) {
|
||||
src1_row =
|
||||
dpct::atomic_fetch_add<sycl::access::address_space::generic_space>(
|
||||
cur_src1_row, 1);
|
||||
row_mapping[src1_row] = {id, iid1};
|
||||
}
|
||||
/*
|
||||
DPCT1065:194: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
|
||||
performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
|
||||
const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
|
||||
float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
|
||||
|
||||
@@ -4026,6 +4004,47 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused(
|
||||
src1_row_stride, stream);
|
||||
}
|
||||
|
||||
// counting sort of the routed rows by expert id (row_id_i, as chosen by the router):
|
||||
// builds a projection of a memory layout where each expert's slice is contiguous
|
||||
static void mmid_counting_sort_rows(
|
||||
const ggml_tensor * ids, const char * ids_host,
|
||||
int64_t n_ids, int64_t n_as, int64_t n_routed_rows,
|
||||
std::vector<int64_t> & expert_counts,
|
||||
std::vector<int64_t> & expert_row_offsets,
|
||||
std::vector<mmid_row_mapping> & routed_row_src) {
|
||||
|
||||
// frequencies: how many routed rows each expert "owns"
|
||||
expert_counts.assign(n_as, 0);
|
||||
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
||||
for (int64_t id = 0; id < n_ids; id++) {
|
||||
const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
|
||||
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
|
||||
expert_counts[row_id_i]++;
|
||||
}
|
||||
}
|
||||
|
||||
// where each expert's slice starts (row indices) and the previous ends
|
||||
expert_row_offsets.assign(n_as + 1, 0);
|
||||
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
||||
expert_row_offsets[i02 + 1] = expert_row_offsets[i02] + expert_counts[i02];
|
||||
}
|
||||
|
||||
std::vector<int64_t> expert_row_next = expert_row_offsets;
|
||||
routed_row_src.resize(n_routed_rows);
|
||||
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
||||
for (int64_t id = 0; id < n_ids; id++) {
|
||||
const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
|
||||
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
|
||||
|
||||
// find and validate the next free row for a given expert (row_id_i)
|
||||
const int64_t routed_row = expert_row_next[row_id_i]++;
|
||||
GGML_ASSERT(routed_row >= expert_row_offsets[row_id_i]);
|
||||
GGML_ASSERT(routed_row < expert_row_offsets[row_id_i + 1]);
|
||||
routed_row_src[routed_row] = {(int32_t) id, (int32_t) iid1};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
ggml_tensor *dst) try {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
|
||||
@@ -4104,99 +4123,91 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
src1_row.data = src1_contiguous.get();
|
||||
dst_row.data = dst_contiguous.get();
|
||||
|
||||
// how many "owned" routed rows to pass to each expert
|
||||
std::vector<int64_t> expert_row_counts;
|
||||
// where each expert's slice starts and the previous ends (row indices, right-exclusive)
|
||||
std::vector<int64_t> expert_row_offsets;
|
||||
// the sources (slot/token pairs) of contiguous rows to guide k_copy_src1_to_contiguous
|
||||
std::vector<mmid_row_mapping> routed_row_src;
|
||||
|
||||
mmid_counting_sort_rows(ids, ids_host.data(), n_ids, n_as, n_routed_rows,
|
||||
expert_row_counts, expert_row_offsets, routed_row_src);
|
||||
|
||||
ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), n_routed_rows);
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
stream->memcpy(dev_row_mapping.get(), routed_row_src.data(), n_routed_rows*sizeof(mmid_row_mapping))));
|
||||
|
||||
const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
|
||||
assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
||||
|
||||
{
|
||||
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
|
||||
sycl::range<3> grid_dims(1, 1, n_routed_rows);
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
char *__restrict src1_contiguous_get =
|
||||
src1_contiguous.get();
|
||||
mmid_row_mapping *__restrict dev_row_mapping_get =
|
||||
dev_row_mapping.get();
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_copy_src1_to_contiguous(
|
||||
src1_original, src1_contiguous_get,
|
||||
dev_row_mapping_get,
|
||||
ne11, ne10, nb11, nb12,
|
||||
item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
||||
int64_t num_src1_rows = 0;
|
||||
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
||||
for (int64_t id = 0; id < n_ids; id++) {
|
||||
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
||||
|
||||
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
|
||||
|
||||
if (row_id_i != i02) {
|
||||
continue;
|
||||
}
|
||||
|
||||
num_src1_rows++;
|
||||
}
|
||||
}
|
||||
const int64_t num_src1_rows = expert_row_counts[i02];
|
||||
|
||||
if (num_src1_rows == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
ggml_sycl_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
|
||||
ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
|
||||
|
||||
const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
|
||||
assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
||||
|
||||
{
|
||||
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
|
||||
sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 0> src1_row_acc(cgh);
|
||||
|
||||
char *__restrict src1_contiguous_get =
|
||||
src1_contiguous.get();
|
||||
int *__restrict dev_cur_src1_row_get =
|
||||
dev_cur_src1_row.get();
|
||||
mmid_row_mapping *__restrict dev_row_mapping_get =
|
||||
dev_row_mapping.get();
|
||||
size_t ids_nb_ct6 = ids->nb[1];
|
||||
size_t ids_nb_ct7 = ids->nb[0];
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_copy_src1_to_contiguous(
|
||||
src1_original, src1_contiguous_get,
|
||||
dev_cur_src1_row_get,
|
||||
dev_row_mapping_get, ids_dev, i02,
|
||||
ids_nb_ct6, ids_nb_ct7, ne11, ne10, nb11, nb12,
|
||||
item_ct1, src1_row_acc);
|
||||
});
|
||||
});
|
||||
}
|
||||
const int64_t expert_row_offset = expert_row_offsets[i02];
|
||||
|
||||
src0_row.data = src0_original + i02*nb02;
|
||||
|
||||
GGML_ASSERT(nb11 == sizeof(float)*ne10);
|
||||
GGML_ASSERT(nb1 == sizeof(float)*ne0);
|
||||
src1_row.data = src1_contiguous.get() + expert_row_offset*nb11;
|
||||
src1_row.ne[1] = num_src1_rows;
|
||||
|
||||
src1_row.nb[1] = nb11;
|
||||
src1_row.nb[2] = num_src1_rows*nb11;
|
||||
src1_row.nb[3] = num_src1_rows*nb11;
|
||||
|
||||
dst_row.data = dst_contiguous.get() + expert_row_offset*nb1;
|
||||
dst_row.ne[1] = num_src1_rows;
|
||||
dst_row.nb[1] = nb1;
|
||||
dst_row.nb[2] = num_src1_rows*nb1;
|
||||
dst_row.nb[3] = num_src1_rows*nb1;
|
||||
|
||||
ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
||||
}
|
||||
|
||||
{
|
||||
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
|
||||
sycl::range<3> grid_dims(1, 1, num_src1_rows);
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
const char *__restrict dst_contiguous_get =
|
||||
dst_contiguous.get();
|
||||
const mmid_row_mapping *__restrict dev_row_mapping_get =
|
||||
dev_row_mapping.get();
|
||||
{
|
||||
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
|
||||
sycl::range<3> grid_dims(1, 1, n_routed_rows);
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
const char *__restrict dst_contiguous_get =
|
||||
dst_contiguous.get();
|
||||
const mmid_row_mapping *__restrict dev_row_mapping_get =
|
||||
dev_row_mapping.get();
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_copy_dst_from_contiguous(dst_original,
|
||||
dst_contiguous_get,
|
||||
dev_row_mapping_get,
|
||||
ne0, nb1, nb2, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_copy_dst_from_contiguous(dst_original,
|
||||
dst_contiguous_get,
|
||||
dev_row_mapping_get,
|
||||
ne0, nb1, nb2, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,10 @@ endif()
|
||||
|
||||
find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
|
||||
find_package(SPIRV-Headers REQUIRED)
|
||||
if (DEFINED ENV{VULKAN_SDK})
|
||||
list(APPEND CMAKE_PREFIX_PATH "$ENV{VULKAN_SDK}")
|
||||
endif()
|
||||
find_package(SPIRV-Headers CONFIG REQUIRED)
|
||||
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
|
||||
# Parallel build object files
|
||||
|
||||
@@ -499,6 +499,12 @@ static constexpr std::initializer_list<ggml_op> topk_moe_late_softmax { GGM
|
||||
GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
|
||||
GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
|
||||
|
||||
// Snake activation: y = x + sin(a*x)^2 * inv_b. Used by the optimize_graph reorder
|
||||
// pass so it keeps the chain contiguous and by the dispatcher to detect the fusion.
|
||||
static constexpr std::initializer_list<ggml_op> snake_pattern { GGML_OP_MUL, GGML_OP_SIN,
|
||||
GGML_OP_SQR, GGML_OP_MUL,
|
||||
GGML_OP_ADD };
|
||||
|
||||
//node #978 ( SOFT_MAX): ffn_moe_probs-15 ( 0K) [Vulka ] use=2: ffn_moe_logits-15 ( 0K) [Vulka ]
|
||||
//node #979 ( RESHAPE): ffn_moe_probs-15 (re ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ]
|
||||
//node #980 ( ARGSORT): ffn_moe_argsort-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ]
|
||||
@@ -846,6 +852,9 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
|
||||
vk_pipeline pipeline_timestep_embedding_f32;
|
||||
vk_pipeline pipeline_conv_transpose_1d_f32;
|
||||
vk_pipeline pipeline_snake_f32;
|
||||
vk_pipeline pipeline_snake_f16;
|
||||
vk_pipeline pipeline_snake_bf16;
|
||||
vk_pipeline pipeline_pool2d_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv6_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv7_f32;
|
||||
@@ -1475,6 +1484,11 @@ struct vk_op_conv_transpose_1d_push_constants {
|
||||
int32_t s0;
|
||||
};
|
||||
|
||||
struct vk_op_snake_push_constants {
|
||||
uint32_t ne0;
|
||||
uint32_t ne1;
|
||||
};
|
||||
|
||||
struct vk_op_pool2d_push_constants {
|
||||
uint32_t IW; uint32_t IH;
|
||||
uint32_t OW; uint32_t OH;
|
||||
@@ -4845,6 +4859,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_snake_f32, "snake_f32", snake_f32_len, snake_f32_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_snake_f16, "snake_f16", snake_f16_len, snake_f16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_snake_bf16, "snake_bf16", snake_bf16_len, snake_bf16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
||||
@@ -12110,6 +12128,45 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
|
||||
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
|
||||
}
|
||||
|
||||
// Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b.
|
||||
// Match the naive mul -> sin -> sqr -> mul -> add chain and run the
|
||||
// dedicated kernel directly. The pattern is validated by
|
||||
// ggml_vk_can_fuse_snake before this call.
|
||||
static void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
|
||||
const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0];
|
||||
const ggml_tensor * sqr = cgraph->nodes[node_idx + 2];
|
||||
const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3];
|
||||
ggml_tensor * add = cgraph->nodes[node_idx + 4];
|
||||
|
||||
// x carries the full activation shape, a is the broadcast operand
|
||||
const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
|
||||
const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
|
||||
|
||||
// mul1 reads sqr and inv_b in either operand order
|
||||
const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
|
||||
|
||||
vk_pipeline pipeline = nullptr;
|
||||
switch (x->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->device->pipeline_snake_f32; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->device->pipeline_snake_f16; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->device->pipeline_snake_bf16; break;
|
||||
default: GGML_ABORT("unsupported type");
|
||||
}
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
|
||||
vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x);
|
||||
vk_subbuffer a_buf = ggml_vk_tensor_subbuffer(ctx, a);
|
||||
vk_subbuffer inv_b_buf = ggml_vk_tensor_subbuffer(ctx, inv_b);
|
||||
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, add);
|
||||
|
||||
vk_op_snake_push_constants pc{};
|
||||
pc.ne0 = static_cast<uint32_t>(x->ne[0]);
|
||||
pc.ne1 = static_cast<uint32_t>(x->ne[1]);
|
||||
|
||||
std::array<uint32_t, 3> elements = { pc.ne0, pc.ne1, 1 };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { x_buf, a_buf, inv_b_buf, dst_buf }, pc, elements);
|
||||
}
|
||||
|
||||
static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
|
||||
const int32_t k1 = dst->op_params[1];
|
||||
@@ -13318,7 +13375,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
|
||||
break;
|
||||
case GGML_OP_MUL:
|
||||
ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
|
||||
if (ctx->num_additional_fused_ops) {
|
||||
ggml_vk_snake_dispatch_fused(ctx, compute_ctx, cgraph, node_idx);
|
||||
} else {
|
||||
ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
|
||||
}
|
||||
|
||||
break;
|
||||
case GGML_OP_DIV:
|
||||
@@ -14691,6 +14752,65 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const
|
||||
return true;
|
||||
}
|
||||
|
||||
// Pattern check for the 5-op Snake fusion: mul -> sin -> sqr -> mul -> add.
|
||||
// Verifies the chain shape, the closure x_in_add == x_in_mul0, and that
|
||||
// the broadcast operands a and inv_b share a [1, C] layout.
|
||||
static bool ggml_vk_can_fuse_snake(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) {
|
||||
GGML_UNUSED(ctx);
|
||||
if (!ggml_can_fuse(cgraph, node_idx, snake_pattern)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0];
|
||||
const ggml_tensor * sin_node = cgraph->nodes[node_idx + 1];
|
||||
const ggml_tensor * sqr = cgraph->nodes[node_idx + 2];
|
||||
const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3];
|
||||
const ggml_tensor * add = cgraph->nodes[node_idx + 4];
|
||||
|
||||
const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
|
||||
const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
|
||||
|
||||
const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
|
||||
const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0];
|
||||
|
||||
if (x_in_add != x) {
|
||||
return false;
|
||||
}
|
||||
if (x->type != GGML_TYPE_F32 && x->type != GGML_TYPE_F16 && x->type != GGML_TYPE_BF16) {
|
||||
return false;
|
||||
}
|
||||
// Shader bindings: data_a is A_TYPE so it follows x's precision, while
|
||||
// data_b and data_c are hardcoded float, so the broadcast operands must
|
||||
// be F32 regardless of x's type.
|
||||
if (a->type != GGML_TYPE_F32) return false;
|
||||
if (inv_b->type != GGML_TYPE_F32) return false;
|
||||
// Chain intermediates and output share x's precision (single A_TYPE / D_TYPE pipeline).
|
||||
if (mul0->type != x->type) return false;
|
||||
if (sin_node->type != x->type) return false;
|
||||
if (sqr->type != x->type) return false;
|
||||
if (mul1->type != x->type) return false;
|
||||
if (add->type != x->type) return false;
|
||||
if (!ggml_are_same_shape(a, inv_b)) {
|
||||
return false;
|
||||
}
|
||||
if (a->ne[0] != 1 || a->ne[1] != x->ne[1]) {
|
||||
return false;
|
||||
}
|
||||
// Dispatch is 2D over (ne0, ne1), so x and add must be 2D and a / inv_b
|
||||
// must collapse to [1, C, 1, 1]. Higher dims are not handled by the shader.
|
||||
if (x->ne[2] != 1 || x->ne[3] != 1) return false;
|
||||
if (add->ne[2] != 1 || add->ne[3] != 1) return false;
|
||||
if (a->ne[2] != 1 || a->ne[3] != 1) return false;
|
||||
if (inv_b->ne[2] != 1 || inv_b->ne[3] != 1) return false;
|
||||
// Shader uses idx = i0 + i1 * ne0 and reads data_b[i1] / data_c[i1],
|
||||
// so every operand must be contiguous.
|
||||
if (!ggml_is_contiguous(x) || !ggml_is_contiguous(add) ||
|
||||
!ggml_is_contiguous(a) || !ggml_is_contiguous(inv_b)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check whether the tensors overlap in memory.
|
||||
// Fusions can potentially overwrite src tensors in ways that are not prevented
|
||||
// by ggml-alloc. If the fusion src is being applied in a way that's elementwise
|
||||
@@ -14998,6 +15118,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
op_srcs_fused_elementwise[0] = false;
|
||||
op_srcs_fused_elementwise[1] = false;
|
||||
op_srcs_fused_elementwise[2] = false;
|
||||
} else if (ggml_vk_can_fuse_snake(ctx, cgraph, i)) {
|
||||
ctx->num_additional_fused_ops = 4;
|
||||
fusion_string = "SNAKE";
|
||||
// elementwise=true: snake.comp is safe under exact aliasing because each
|
||||
// thread reads data_x[idx] into a register before writing data_d[idx]
|
||||
// with a data dependency on that register. The overlap check still
|
||||
// rejects partial overlaps (different base or size).
|
||||
std::fill_n(op_srcs_fused_elementwise, 5, true);
|
||||
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
|
||||
ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
|
||||
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
|
||||
@@ -15288,6 +15416,9 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
|
||||
if (keep_pattern(topk_moe_late_softmax)) {
|
||||
continue;
|
||||
}
|
||||
if (keep_pattern(snake_pattern)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// First, grab the next unused node.
|
||||
current_set.push_back(first_unused);
|
||||
@@ -15310,7 +15441,8 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
|
||||
if (match_pattern(topk_moe_early_softmax_norm, j) ||
|
||||
match_pattern(topk_moe_sigmoid_norm_bias, j) ||
|
||||
match_pattern(topk_moe_early_softmax, j) ||
|
||||
match_pattern(topk_moe_late_softmax, j)) {
|
||||
match_pattern(topk_moe_late_softmax, j) ||
|
||||
match_pattern(snake_pattern, j)) {
|
||||
continue;
|
||||
}
|
||||
bool ok = true;
|
||||
|
||||
@@ -44,36 +44,81 @@ void im2col(const uint ow, const uint z_idx) {
|
||||
|
||||
const uint KHKW = p.KH * p.KW;
|
||||
|
||||
// Precompute base input coordinates
|
||||
const int base_iw = int(ow * p.s0) - p.p0;
|
||||
const int base_ih = int(oh * p.s1) - p.p1;
|
||||
|
||||
// Precompute step deltas
|
||||
const uint delta_ic = BLOCK_SIZE / KHKW;
|
||||
const uint delta_rem = BLOCK_SIZE % KHKW;
|
||||
|
||||
const uint delta_ky = delta_rem / p.KW;
|
||||
const uint delta_kx = delta_rem % p.KW;
|
||||
|
||||
const uint delta_ic_offset = delta_ic * p.offset_delta;
|
||||
|
||||
// If using BDA mode, precompute the base pointer and step size
|
||||
#if BDA
|
||||
const BDA_STORAGE_T base_dst_addr = p.dst_addr + D_SIZE * dst_row;
|
||||
const uint bda_step = D_SIZE * BLOCK_SIZE;
|
||||
#endif
|
||||
|
||||
uint wg_x = gl_WorkGroupID.x;
|
||||
do {
|
||||
const uint wg_offset = wg_x * 512;
|
||||
|
||||
[[unroll]] for (uint i = 0; i < NUM_ITER; ++i) {
|
||||
const uint chw_idx = wg_offset + gidx + i * BLOCK_SIZE;
|
||||
uint chw_idx = wg_offset + gidx;
|
||||
|
||||
uint ic = chw_idx / KHKW;
|
||||
uint rem = chw_idx % KHKW;
|
||||
|
||||
uint ky = rem / p.KW;
|
||||
uint kx = rem % p.KW;
|
||||
|
||||
uint ic_offset = src_batch + ic * p.offset_delta;
|
||||
|
||||
// Initialize running pointer/index for the destination buffer
|
||||
#if BDA
|
||||
BDA_STORAGE_T current_dst_addr = base_dst_addr + D_SIZE * chw_idx;
|
||||
#else
|
||||
uint current_dst_idx = dst_row + chw_idx;
|
||||
#endif
|
||||
|
||||
[[unroll]] for (uint i = 0; i < NUM_ITER; ++i) {
|
||||
if (chw_idx >= p.CHW) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint ic = chw_idx / KHKW;
|
||||
const uint rem = chw_idx - ic * KHKW;
|
||||
const uint ky = rem / p.KW;
|
||||
const uint kx = rem - ky * p.KW;
|
||||
|
||||
const uint iiw = ow * p.s0 + kx * p.d0 - p.p0;
|
||||
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
|
||||
const int iiw = base_iw + int(kx * p.d0);
|
||||
const int iih = base_ih + int(ky * p.d1);
|
||||
|
||||
A_TYPE val = A_TYPE(0);
|
||||
if (iih < p.IH && iiw < p.IW) {
|
||||
val = data_a[src_batch + ic * p.offset_delta + iih * p.IW + iiw];
|
||||
if (uint(iih) < p.IH && uint(iiw) < p.IW) {
|
||||
val = data_a[ic_offset + uint(iih) * p.IW + uint(iiw)];
|
||||
}
|
||||
|
||||
#if BDA
|
||||
D_ptr out_ptr = D_ptr(p.dst_addr + D_SIZE * (dst_row + chw_idx));
|
||||
out_ptr.d = D_TYPE(val);
|
||||
D_ptr(current_dst_addr).d = D_TYPE(val);
|
||||
current_dst_addr += bda_step;
|
||||
#else
|
||||
data_d[dst_row + chw_idx] = D_TYPE(val);
|
||||
data_d[current_dst_idx] = D_TYPE(val);
|
||||
current_dst_idx += BLOCK_SIZE;
|
||||
#endif
|
||||
|
||||
chw_idx += BLOCK_SIZE;
|
||||
ic_offset += delta_ic_offset;
|
||||
kx += delta_kx;
|
||||
ky += delta_ky;
|
||||
|
||||
// Handle X axis wrap
|
||||
uint kx_wrap = uint(kx >= p.KW);
|
||||
kx -= kx_wrap * p.KW;
|
||||
ky += kx_wrap;
|
||||
|
||||
// Handle Y axis wrap
|
||||
uint ky_wrap = uint(ky >= p.KH);
|
||||
ky -= ky_wrap * p.KH;
|
||||
ic_offset += ky_wrap * p.offset_delta;
|
||||
}
|
||||
|
||||
wg_x += gl_NumWorkGroups.x;
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
#version 450
|
||||
|
||||
#include "types.glsl"
|
||||
|
||||
// Fused snake activation: y = x + sin(b * x)^2 * c
|
||||
// data_a [ne0, ne1] per element activation x (A_TYPE)
|
||||
// data_b [1, ne1] per channel multiplier (float)
|
||||
// data_c [1, ne1] per channel inverse scale (float, precomputed as 1 / freq)
|
||||
// data_d [ne0, ne1] output y (D_TYPE)
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer B {float data_b[];};
|
||||
layout (binding = 2) readonly buffer C {float data_c[];};
|
||||
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint32_t ne0;
|
||||
uint32_t ne1;
|
||||
} p;
|
||||
|
||||
// Load A_TYPE to float
|
||||
float load_val(uint32_t idx) {
|
||||
#if defined(DATA_A_BF16)
|
||||
return bf16_to_fp32(uint32_t(data_a[idx]));
|
||||
#else
|
||||
return float(data_a[idx]);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Store float as D_TYPE
|
||||
void store_val(uint32_t idx, float v) {
|
||||
#if defined(DATA_D_BF16)
|
||||
data_d[idx] = D_TYPE(fp32_to_bf16(v));
|
||||
#else
|
||||
data_d[idx] = D_TYPE(v);
|
||||
#endif
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint32_t i0 = gl_GlobalInvocationID.x;
|
||||
const uint32_t i1 = gl_GlobalInvocationID.y;
|
||||
if (i0 >= p.ne0 || i1 >= p.ne1) return;
|
||||
|
||||
const uint32_t idx = i0 + i1 * p.ne0;
|
||||
const float xi = load_val(idx);
|
||||
const float s = sin(data_b[i1] * xi);
|
||||
store_val(idx, xi + s * s * data_c[i1]);
|
||||
}
|
||||
@@ -952,6 +952,10 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("snake_f32", "snake.comp", {{"DATA_A_F32", "1"}, {"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("snake_f16", "snake.comp", {{"DATA_A_F16", "1"}, {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("snake_bf16", "snake.comp", {{"DATA_A_BF16", "1"}, {"DATA_D_BF16", "1"}, {"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}});
|
||||
|
||||
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
|
||||
@@ -122,9 +122,9 @@ const V_CHUNKS: u32 = HEAD_DIM_V / 4u;
|
||||
const SCORE_REGS_PER_LANE: u32 = (KV_TILE + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE;
|
||||
const OUT_REGS_PER_LANE: u32 = (V_CHUNKS + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE;
|
||||
|
||||
var<workgroup> q_shmem: array<f32, Q_TILE * HEAD_DIM_QK>;
|
||||
var<workgroup> kv_shmem: array<f32, KV_TILE * KV_STAGE_STRIDE>;
|
||||
var<workgroup> p_shmem: array<f32, Q_TILE * KV_TILE>;
|
||||
var<workgroup> q_shmem: array<Q_TYPE, Q_TILE * HEAD_DIM_QK>;
|
||||
var<workgroup> kv_shmem: array<KV_TYPE, KV_TILE * KV_STAGE_STRIDE>;
|
||||
var<workgroup> p_shmem: array<KV_TYPE, Q_TILE * KV_TILE>;
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
@@ -169,10 +169,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
|
||||
let head = f32(head_idx);
|
||||
let slope = select(1.0,
|
||||
select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
|
||||
pow(params.m0, head + 1.0),
|
||||
head < params.n_head_log2),
|
||||
params.max_bias > 0.0);
|
||||
select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
|
||||
pow(params.m0, head + 1.0),
|
||||
head < params.n_head_log2),
|
||||
params.max_bias > 0.0);
|
||||
|
||||
for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
|
||||
let q_tile_row = elem_idx / HEAD_DIM_QK;
|
||||
@@ -181,7 +181,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
|
||||
q_shmem[elem_idx] = select(
|
||||
0.0,
|
||||
f32(Q[global_q_row_offset + q_col]) * params.scale,
|
||||
Q_TYPE(Q[global_q_row_offset + q_col]) * params.scale,
|
||||
head_q_row < params.seq_len_q);
|
||||
}
|
||||
|
||||
@@ -213,10 +213,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
let k_vec_index = (k_head_offset + global_k_row * params.stride_k1 + chunk * 4u) >> 2u;
|
||||
let k4 = K[k_vec_index];
|
||||
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
|
||||
kv_shmem[kv_off + 0u] = f32(k4.x);
|
||||
kv_shmem[kv_off + 1u] = f32(k4.y);
|
||||
kv_shmem[kv_off + 2u] = f32(k4.z);
|
||||
kv_shmem[kv_off + 3u] = f32(k4.w);
|
||||
kv_shmem[kv_off + 0u] = KV_TYPE(k4.x);
|
||||
kv_shmem[kv_off + 1u] = KV_TYPE(k4.y);
|
||||
kv_shmem[kv_off + 2u] = KV_TYPE(k4.z);
|
||||
kv_shmem[kv_off + 3u] = KV_TYPE(k4.w);
|
||||
}
|
||||
|
||||
workgroupBarrier();
|
||||
@@ -233,18 +233,18 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
var dot_val = 0.0;
|
||||
for (var chunk = 0u; chunk < Q_CHUNKS; chunk += 1u) {
|
||||
let q_off = q_base + chunk * 4u;
|
||||
let qv = vec4<f32>(
|
||||
let qv = vec4<Q_TYPE>(
|
||||
q_shmem[q_off + 0u],
|
||||
q_shmem[q_off + 1u],
|
||||
q_shmem[q_off + 2u],
|
||||
q_shmem[q_off + 3u]);
|
||||
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
|
||||
let kv = vec4<f32>(
|
||||
let kv = vec4<KV_TYPE>(
|
||||
kv_shmem[kv_off + 0u],
|
||||
kv_shmem[kv_off + 1u],
|
||||
kv_shmem[kv_off + 2u],
|
||||
kv_shmem[kv_off + 3u]);
|
||||
dot_val += dot(qv, kv);
|
||||
dot_val += dot(vec4<f32>(qv), vec4<f32>(kv));
|
||||
}
|
||||
#ifdef LOGIT_SOFTCAP
|
||||
dot_val = params.logit_softcap * tanh(dot_val);
|
||||
@@ -271,7 +271,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
let kv_local = sg_inv_id + slot * subgroup_size;
|
||||
if (row_active && kv_local < kv_count) {
|
||||
let p = exp(local_scores[slot] - new_max);
|
||||
p_shmem[subgroup_p_offset + kv_local] = p;
|
||||
p_shmem[subgroup_p_offset + kv_local] = KV_TYPE(p);
|
||||
local_sum += p;
|
||||
}
|
||||
}
|
||||
@@ -285,10 +285,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
let v_vec_index = (v_head_offset + global_v_row * params.stride_v1 + chunk * 4u) >> 2u;
|
||||
let v4 = V[v_vec_index];
|
||||
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
|
||||
kv_shmem[kv_off + 0u] = f32(v4.x);
|
||||
kv_shmem[kv_off + 1u] = f32(v4.y);
|
||||
kv_shmem[kv_off + 2u] = f32(v4.z);
|
||||
kv_shmem[kv_off + 3u] = f32(v4.w);
|
||||
kv_shmem[kv_off + 0u] = KV_TYPE(v4.x);
|
||||
kv_shmem[kv_off + 1u] = KV_TYPE(v4.y);
|
||||
kv_shmem[kv_off + 2u] = KV_TYPE(v4.z);
|
||||
kv_shmem[kv_off + 3u] = KV_TYPE(v4.w);
|
||||
}
|
||||
|
||||
workgroupBarrier();
|
||||
@@ -308,12 +308,12 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
for (var kv_local = 0u; kv_local < kv_count; kv_local += 1u) {
|
||||
let p = p_shmem[subgroup_p_offset + kv_local];
|
||||
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
|
||||
let v4 = vec4<f32>(
|
||||
let v4 = vec4<KV_TYPE>(
|
||||
kv_shmem[kv_off + 0u],
|
||||
kv_shmem[kv_off + 1u],
|
||||
kv_shmem[kv_off + 2u],
|
||||
kv_shmem[kv_off + 3u]);
|
||||
acc += p * v4;
|
||||
acc += f32(p) * vec4<f32>(v4);
|
||||
}
|
||||
out_regs[reg_idx] = acc;
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
|
||||
ExternalProject_Add(
|
||||
zendnn
|
||||
GIT_REPOSITORY https://github.com/amd/ZenDNN.git
|
||||
GIT_TAG ac9e580d9434b7b98985f2627a7ebfb5eba4bb0d # ZenDNN-2026-WW17
|
||||
GIT_TAG 253b94ce0d7e9284c265fefb485714944caff9d3 # ZenDNN-2026-WW19
|
||||
PREFIX ${ZENDNN_PREFIX}
|
||||
SOURCE_DIR ${ZENDNN_SOURCE_DIR}
|
||||
BINARY_DIR ${ZENDNN_BUILD_DIR}
|
||||
|
||||
@@ -2,6 +2,10 @@
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#define GGML_COMMON_DECL_CPP
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include "zendnnl.hpp"
|
||||
|
||||
#include <cstring>
|
||||
@@ -19,6 +23,8 @@ zendnnl::common::data_type_t ggml_to_zendnn_type() {
|
||||
return zendnnl::common::data_type_t::f32;
|
||||
} else if constexpr (std::is_same_v<T, ggml_bf16_t>) {
|
||||
return zendnnl::common::data_type_t::bf16;
|
||||
} else if constexpr (std::is_same_v<T, block_q8_0>) {
|
||||
return zendnnl::common::data_type_t::s8;
|
||||
} else {
|
||||
return zendnnl::common::data_type_t::none;
|
||||
}
|
||||
@@ -48,6 +54,17 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int
|
||||
params.num_threads = ctx->n_threads;
|
||||
|
||||
zendnnl::lowoha::matmul::matmul_batch_params_t batch_params;
|
||||
|
||||
if constexpr (std::is_same_v<TA, block_q8_0>) {
|
||||
params.dtypes.compute = zendnnl::common::data_type_t::s8;
|
||||
const int64_t num_groups = k / QK8_0;
|
||||
params.dynamic_quant = true;
|
||||
params.quant_params.src_scale.buff = nullptr;
|
||||
params.quant_params.src_scale.dt = zendnnl::common::data_type_t::bf16;
|
||||
params.quant_params.src_scale.dims = {n, num_groups};
|
||||
params.packing.pack_format_b = 1;
|
||||
}
|
||||
|
||||
zendnnl::error_handling::status_t status = zendnnl::lowoha::matmul::matmul_direct(
|
||||
'r', false, true, // row-major, don't transpose B, transpose A (because it's column-major)
|
||||
n, // M: rows of B and C
|
||||
@@ -108,6 +125,14 @@ static bool ggml_zendnn_sgemm(ggml_backend_zendnn_context * ctx, int64_t m, int6
|
||||
(const ggml_bf16_t *)B, ldb,
|
||||
(float *)C, ldc);
|
||||
return false;
|
||||
case GGML_TYPE_Q8_0:
|
||||
if (Btype != GGML_TYPE_F32 || Ctype != GGML_TYPE_F32)
|
||||
return false;
|
||||
return ggml_zendnn_matmul<block_q8_0, float, float>(
|
||||
ctx, m, n, k,
|
||||
(const block_q8_0 *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc);
|
||||
default:
|
||||
return false; // unsupported type
|
||||
}
|
||||
@@ -145,7 +170,9 @@ static void ggml_zendnn_compute_forward_mul_mat(
|
||||
const int64_t r3 = ne13/ne03;
|
||||
|
||||
void * work_data = ctx->work_data.get();
|
||||
if (src1->type != vec_dot_type) {
|
||||
|
||||
// ZenDNN requires FP32 for dynamic quantization, so conversion is skipped
|
||||
if (src1->type != vec_dot_type && src0->type != GGML_TYPE_Q8_0) {
|
||||
const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
|
||||
const size_t nbw2 = nbw1 * ne11;
|
||||
const size_t nbw3 = nbw2 * ne12;
|
||||
@@ -171,7 +198,7 @@ static void ggml_zendnn_compute_forward_mul_mat(
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
const void* wdata = src1->type == vec_dot_type ? src1->data : work_data;
|
||||
const void* wdata = (src1->type == vec_dot_type || src0->type == GGML_TYPE_Q8_0) ? src1->data : work_data;
|
||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
if (!ggml_zendnn_sgemm(ctx,
|
||||
ne01, // m
|
||||
@@ -184,7 +211,7 @@ static void ggml_zendnn_compute_forward_mul_mat(
|
||||
static_cast<char *>(dst->data) + i12*nb2 + i13*nb3,
|
||||
ne01, // ldc
|
||||
src0->type,
|
||||
vec_dot_type,
|
||||
src0->type == GGML_TYPE_Q8_0 ? GGML_TYPE_F32 : vec_dot_type,
|
||||
dst->type))
|
||||
GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
|
||||
}
|
||||
@@ -261,10 +288,15 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
|
||||
const size_t nbw1 = row_size;
|
||||
const size_t nbw2 = nbw1 * ne11;
|
||||
const size_t nbw3 = nbw2 * ne12;
|
||||
const size_t src1_conv_size = (src1->type != vec_dot_type) ? ne13 * nbw3 : 0;
|
||||
const size_t src1_conv_size = (src1->type != vec_dot_type && src0->type != GGML_TYPE_Q8_0) ? ne13 * nbw3 : 0;
|
||||
|
||||
// For Q8_0, src1 is always F32; the gather buffer must hold F32 rows (ne10*4 bytes),
|
||||
// not Q8_0-encoded rows (row_size ≈ ne10/32*34 bytes) — they differ by ~4x.
|
||||
const size_t f32_row_size = (size_t)ne10 * sizeof(float);
|
||||
const size_t gather_row_size = (src0->type == GGML_TYPE_Q8_0) ? f32_row_size : row_size;
|
||||
|
||||
// size for MoE gather/scatter buffers
|
||||
const size_t wdata_cur_size = max_rows * row_size;
|
||||
const size_t wdata_cur_size = max_rows * gather_row_size;
|
||||
const size_t dst_cur_size = max_rows * ggml_row_size(dst->type, ne01);
|
||||
|
||||
// allocate single buffer for all needs
|
||||
@@ -279,7 +311,8 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
|
||||
char * wdata_cur = work_data + src1_conv_size;
|
||||
char * dst_cur = wdata_cur + wdata_cur_size;
|
||||
|
||||
if (src1->type != vec_dot_type) {
|
||||
// ZenDNN requires FP32 for dynamic quantization, so conversion is skipped
|
||||
if (src1->type != vec_dot_type && src0->type != GGML_TYPE_Q8_0) {
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
#pragma omp parallel for collapse(3) num_threads(ctx->n_threads) schedule(static)
|
||||
@@ -294,7 +327,7 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
|
||||
}
|
||||
}
|
||||
|
||||
const void * wdata = src1->type == vec_dot_type ? src1->data : work_data;
|
||||
const void * wdata = (src1->type == vec_dot_type || src0->type == GGML_TYPE_Q8_0) ? src1->data : work_data;
|
||||
|
||||
// process each expert with gather -> gemm -> scatter pattern
|
||||
for (int64_t cur_a = 0; cur_a < n_as; ++cur_a) {
|
||||
@@ -315,9 +348,9 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
|
||||
const int64_t i12 = row_mapping.i2;
|
||||
|
||||
std::memcpy(
|
||||
wdata_cur + ir1 * row_size,
|
||||
(const char *) wdata + (i11 + i12*ne11) * row_size,
|
||||
row_size
|
||||
wdata_cur + ir1 * gather_row_size,
|
||||
(const char *) wdata + (i11 + i12*ne11) * gather_row_size,
|
||||
gather_row_size
|
||||
);
|
||||
}
|
||||
|
||||
@@ -333,7 +366,7 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
|
||||
dst_cur,
|
||||
ne01, // ldc
|
||||
src0->type,
|
||||
vec_dot_type,
|
||||
src0->type == GGML_TYPE_Q8_0 ? GGML_TYPE_F32 : vec_dot_type,
|
||||
dst->type)) {
|
||||
GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
|
||||
}
|
||||
@@ -577,6 +610,7 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
|
||||
switch (weights->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_BF16:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
||||
@@ -747,7 +747,7 @@ class MODEL_TENSOR(IntEnum):
|
||||
V_LAYER_OUT_SCALE = auto()
|
||||
V_PRE_NORM = auto()
|
||||
V_POST_NORM = auto()
|
||||
V_MM_PRE_NORM = auto() # hunyuanocr
|
||||
V_MM_PRE_NORM = auto() # hunyuanvl
|
||||
V_MM_POST_NORM = auto()
|
||||
V_MM_INP_NORM = auto()
|
||||
V_MM_INP_PROJ = auto() # gemma3
|
||||
@@ -791,8 +791,8 @@ class MODEL_TENSOR(IntEnum):
|
||||
V_MM_GATE = auto() # cogvlm
|
||||
V_TOK_BOI = auto() # cogvlm
|
||||
V_TOK_EOI = auto() # cogvlm
|
||||
V_TOK_IMG_BEGIN = auto() # hunyuanocr
|
||||
V_TOK_IMG_END = auto() # hunyuanocr
|
||||
V_TOK_IMG_BEGIN = auto() # hunyuanvl
|
||||
V_TOK_IMG_END = auto() # hunyuanvl
|
||||
V_STD_BIAS = auto() # gemma4
|
||||
V_STD_SCALE = auto() # gemma4
|
||||
V_SAM_POS_EMBD = auto() # Deepseek-OCR
|
||||
@@ -4273,7 +4273,6 @@ class VisionProjectorType:
|
||||
GLM4V = "glm4v"
|
||||
YOUTUVL = "youtuvl"
|
||||
NEMOTRON_V2_VL = "nemotron_v2_vl"
|
||||
HUNYUANOCR = "hunyuanocr"
|
||||
HUNYUANVL = "hunyuanvl"
|
||||
MINICPMV4_6 = "minicpmv4_6"
|
||||
GRANITE_SPEECH = "granite_speech" # audio
|
||||
|
||||
@@ -28,6 +28,7 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati
|
||||
# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
|
||||
def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
|
||||
rows = arr.reshape((-1, arr.shape[-1]))
|
||||
assert len(rows.shape)
|
||||
osize = 1
|
||||
for dim in oshape:
|
||||
osize *= dim
|
||||
|
||||
@@ -1366,7 +1366,7 @@ class TensorNameMap:
|
||||
"mlp_AR.linear_{bid}", # PaddleOCR-VL
|
||||
"merger.mlp.{bid}",
|
||||
"vision_tower.merger.mlp.{bid}", # dots.ocr
|
||||
"vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
|
||||
"vit.perceive.proj.{bid}", # HunyuanVL (proj.0 = conv1, proj.2 = conv2)
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_FC: (
|
||||
@@ -1374,7 +1374,7 @@ class TensorNameMap:
|
||||
"model.vision.linear_proj.linear_proj", # cogvlm
|
||||
"model.projector.layers", # Deepseek-OCR
|
||||
"visual.merger.proj", # glm4v
|
||||
"vit.perceive.mlp", # HunyuanOCR
|
||||
"vit.perceive.mlp", # HunyuanVL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_MLP: (
|
||||
@@ -1403,7 +1403,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
|
||||
"vpm.embeddings.patch_embedding",
|
||||
"model.vision_model.embeddings.patch_embedding", # SmolVLM
|
||||
"vit.embeddings.patch_embedding", # HunyuanOCR
|
||||
"vit.embeddings.patch_embedding", # HunyuanVL
|
||||
"vision_tower.patch_conv", # pixtral-hf
|
||||
"vision_encoder.patch_conv", # pixtral
|
||||
"vision_model.patch_embedding.linear", # llama 4
|
||||
@@ -1429,7 +1429,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
|
||||
"vpm.embeddings.position_embedding",
|
||||
"model.vision_model.embeddings.position_embedding", # SmolVLM
|
||||
"vit.embeddings.position_embedding", # HunyuanOCR
|
||||
"vit.embeddings.position_embedding", # HunyuanVL
|
||||
"vision_model.positional_embedding_vlm", # llama 4
|
||||
"vision_tower.patch_embed.pos_emb", # kimi-vl
|
||||
"visual.pos_embed", # qwen3vl
|
||||
@@ -1442,12 +1442,12 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
|
||||
"model.image_newline", # Deepseek-OCR
|
||||
"vit.perceive.image_newline", # HunyuanOCR
|
||||
"vit.perceive.image_newline", # HunyuanVL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_VSEP: (
|
||||
"model.view_seperator", # Deepseek-OCR
|
||||
"vit.perceive.image_sep", # HunyuanOCR
|
||||
"vit.perceive.image_sep", # HunyuanVL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_QKV: (
|
||||
@@ -1466,7 +1466,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.self_attn.q_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
|
||||
"vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR
|
||||
"vit.layers.{bid}.self_attn.q_proj", # HunyuanVL
|
||||
"vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
|
||||
@@ -1490,7 +1490,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.self_attn.k_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
|
||||
"vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR
|
||||
"vit.layers.{bid}.self_attn.k_proj", # HunyuanVL
|
||||
"vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
|
||||
@@ -1514,7 +1514,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.self_attn.v_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
|
||||
"vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR
|
||||
"vit.layers.{bid}.self_attn.v_proj", # HunyuanVL
|
||||
"vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
|
||||
@@ -1532,7 +1532,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.layer_norm1",
|
||||
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
|
||||
"vit.layers.{bid}.input_layernorm", # HunyuanOCR
|
||||
"vit.layers.{bid}.input_layernorm", # HunyuanVL
|
||||
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
|
||||
"vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
|
||||
@@ -1553,7 +1553,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.self_attn.out_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
|
||||
"vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR
|
||||
"vit.layers.{bid}.self_attn.o_proj", # HunyuanVL
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
|
||||
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
|
||||
@@ -1580,7 +1580,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.layer_norm2",
|
||||
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
|
||||
"vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR
|
||||
"vit.layers.{bid}.post_attention_layernorm", # HunyuanVL
|
||||
"vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
|
||||
@@ -1601,7 +1601,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.mlp.fc1",
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
|
||||
"vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR
|
||||
"vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanVL
|
||||
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
|
||||
"vision_model.model.layers.{bid}.mlp.fc1", # llama4
|
||||
@@ -1630,7 +1630,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.mlp.fc2",
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
|
||||
"vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR
|
||||
"vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanVL
|
||||
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
|
||||
"vision_model.model.layers.{bid}.mlp.fc2", # llama4
|
||||
@@ -1694,7 +1694,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.V_MM_POST_NORM: (
|
||||
"visual.merger.post_projection_norm", # glm4v
|
||||
"vision_tower.post_trunk_norm", # dots.ocr
|
||||
"vit.perceive.after_rms", # HunyuanOCR
|
||||
"vit.perceive.after_rms", # HunyuanVL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_INP_PROJ: (
|
||||
@@ -1899,15 +1899,15 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_PRE_NORM: (
|
||||
"vit.perceive.before_rms", # HunyuanOCR
|
||||
"vit.perceive.before_rms", # HunyuanVL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_TOK_IMG_BEGIN: (
|
||||
"vit.perceive.image_begin", # HunyuanOCR
|
||||
"vit.perceive.image_begin", # HunyuanVL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_TOK_IMG_END: (
|
||||
"vit.perceive.image_end", # HunyuanOCR
|
||||
"vit.perceive.image_end", # HunyuanVL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_STD_BIAS: (
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
-r ./requirements-convert_legacy_llama.txt
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
|
||||
## Embedding Gemma requires PyTorch 2.6.0 or later
|
||||
torch~=2.6.0; platform_machine != "s390x"
|
||||
## Embedding Gemma requires PyTorch 2.6.0 or later, bumped to 2.11.0 for compatibility
|
||||
torch==2.11.0; platform_machine != "s390x"
|
||||
|
||||
# torch s390x packages can only be found from nightly builds
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly
|
||||
|
||||
@@ -45,5 +45,5 @@ adb $adbserial $adbhost shell " \
|
||||
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
$ndev $nhvx $opmask $verbose $profile $hb ./$branch/bin/llama-bench --device $device --mmap 0 -m $basedir/../gguf/$model \
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
|
||||
--ubatch-size 256 -fa 1 -ngl 99 $cli_opts $@ \
|
||||
--ubatch-size 1024 -fa 1 -ngl 99 $cli_opts $@ \
|
||||
"
|
||||
|
||||
@@ -73,6 +73,6 @@ adb $adbserial $adbhost shell " \
|
||||
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $opflt $vmem $mbuf \
|
||||
./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
|
||||
--ctx-size 8192 --ubatch-size 256 -fa on \
|
||||
--ctx-size 8192 --ubatch-size 1024 -fa on \
|
||||
-ngl 99 --device $device $cli_opts $@ \
|
||||
"
|
||||
|
||||
@@ -69,6 +69,6 @@ adb $adbserial $adbhost shell " \
|
||||
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $opflt $vmem $mbuf \
|
||||
./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
|
||||
--ctx-size 8192 --ubatch-size 256 -fa on \
|
||||
-ngl 99 --device $device $cli_opts $@ \
|
||||
--ctx-size 8192 --ubatch-size 1024 -fa on \
|
||||
-ngl 99 --device $device $cli_opts $@ \
|
||||
"
|
||||
|
||||
@@ -66,6 +66,6 @@ adb $adbserial $adbhost shell " \
|
||||
--mmproj $basedir/../gguf/$mmproj \
|
||||
--image $basedir/../gguf/$image \
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
|
||||
--ctx-size 8192 --ubatch-size 256 -fa on \
|
||||
--ctx-size 8192 --ubatch-size 1024 -fa on \
|
||||
-ngl 99 --device $device -v $cli_opts $@ \
|
||||
"
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user