mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-14 17:56:43 +02:00
Compare commits
101 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 8ed274ef46 | |||
| 46722116b9 | |||
| c2ba3e47a2 | |||
| 53bd47ea5b | |||
| 4988f6e866 | |||
| f05cf4676a | |||
| e8067a8b36 | |||
| 341babcf73 | |||
| 1a7718b4c5 | |||
| 597b6672e8 | |||
| 57fe1f07c3 | |||
| d8a24ccee2 | |||
| c34b92235b | |||
| e37abd6b5f | |||
| f58bad4137 | |||
| cd5044661c | |||
| ebc10770ac | |||
| 3e7bd4f39a | |||
| f7ca93d12c | |||
| 02182fc5b9 | |||
| f532be8fac | |||
| e08c226a2c | |||
| 70b54e140c | |||
| 6471e3c090 | |||
| 88a39274ec | |||
| 85f99dca8b | |||
| 099ea76fb4 | |||
| ba1df050f3 | |||
| 1593d5684d | |||
| 4c6595503f | |||
| 263cc04a54 | |||
| 17e59d6209 | |||
| fdc3db9b65 | |||
| 1af154a76f | |||
| 18ef86ecec | |||
| 1bfbdb134e | |||
| 68f30663cf | |||
| db94854ff5 | |||
| ac4cddeb0d | |||
| e95dae18d6 | |||
| d2462f8f7a | |||
| fb83cc9a07 | |||
| 039e20a2db | |||
| d2e22ed975 | |||
| 76da2450a4 | |||
| d73cd07674 | |||
| e25a32e98c | |||
| 483609509d | |||
| 49f3542190 | |||
| d6d0ce8215 | |||
| b4e3dc613b | |||
| ae735b1314 | |||
| 9682e351b8 | |||
| 1e912561dd | |||
| efbacf8d21 | |||
| 26021699bc | |||
| 961e9a3e46 | |||
| f0152efe40 | |||
| fd3271e0b4 | |||
| e3471b3e73 | |||
| 3ac3c20c96 | |||
| 1e1aca09da | |||
| 7d2b45b4f7 | |||
| 42a0afd594 | |||
| a66d50588b | |||
| 1705d434f6 | |||
| 3b3da01dc2 | |||
| 3ebe862b5d | |||
| 8f83d6c271 | |||
| c2b1518fd4 | |||
| 6a1de6fbf1 | |||
| 715b86a366 | |||
| c74759a244 | |||
| 0f7fada56b | |||
| 19bba67c1f | |||
| daf6bc9f2d | |||
| d403f00ec3 | |||
| 9e3b928fd8 | |||
| 8a963fc10e | |||
| 379ac6673b | |||
| f0156d1401 | |||
| 04eb4c446d | |||
| 8a091c47ab | |||
| 465b1f0e75 | |||
| f71af352a5 | |||
| 3f7c79d7b5 | |||
| 98d5e8ba8a | |||
| 31e82494c0 | |||
| 6b80c74f28 | |||
| 588f0dc2ce | |||
| f5c6ae1827 | |||
| 5a69c97439 | |||
| 5343f4502a | |||
| 603300b008 | |||
| 308f61c31f | |||
| da87e9b612 | |||
| e82beaa60d | |||
| c4a278d68e | |||
| 64086f2b2f | |||
| 6effcecd0b | |||
| 86591c7536 |
@@ -53,7 +53,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt-get install -y libgomp1 curl ffmpeg \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=12.8.1
|
||||
ARG GCC_VERSION=14
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
@@ -12,13 +13,14 @@ ARG APP_REVISION=N/A
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
ARG GCC_VERSION
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
ARG CUDA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
|
||||
apt-get install -y gcc-${GCC_VERSION} g++-${GCC_VERSION} build-essential cmake python3 python3-pip git libssl-dev libgomp1
|
||||
|
||||
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
|
||||
ENV CC=gcc-${GCC_VERSION} CXX=g++-${GCC_VERSION} CUDAHOSTCXX=g++-${GCC_VERSION}
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -59,7 +61,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt-get install -y libgomp1 curl ffmpeg \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
|
||||
@@ -57,11 +57,21 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
ARG IGC_VERSION=v2.20.5
|
||||
ARG IGC_VERSION_FULL=2_2.20.5+19972
|
||||
ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
|
||||
ARG IGDGMM_VERSION=22.8.2
|
||||
#Following versions are for multiple GPUs, since 26.x has known issue:
|
||||
# https://github.com/ggml-org/llama.cpp/issues/21747,
|
||||
# https://github.com/intel/compute-runtime/issues/921.
|
||||
#ARG IGC_VERSION=v2.20.5
|
||||
#ARG IGC_VERSION_FULL=2_2.20.5+19972
|
||||
#ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
|
||||
#ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
|
||||
#ARG IGDGMM_VERSION=22.8.2
|
||||
|
||||
|
||||
ARG IGC_VERSION=v2.34.4
|
||||
ARG IGC_VERSION_FULL=2_2.34.4+21428
|
||||
ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
|
||||
ARG IGDGMM_VERSION=22.10.0
|
||||
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
|
||||
@@ -75,7 +85,7 @@ RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
|
||||
&& dpkg --install *.deb
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt-get install -y libgomp1 curl ffmpeg \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
|
||||
@@ -64,7 +64,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt-get install -y libgomp1 curl ffmpeg \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
|
||||
@@ -107,7 +107,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
|
||||
&& apt-get install -y libgomp1 libtbb12 curl wget ffmpeg ocl-icd-libopencl1 \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
|
||||
@@ -76,7 +76,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt-get install -y libgomp1 curl ffmpeg \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
|
||||
@@ -49,7 +49,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
|
||||
&& apt-get install -y libgomp1 curl ffmpeg libvulkan1 mesa-vulkan-drivers \
|
||||
libglvnd0 libgl1 libglx0 libegl1 libgles2 \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
|
||||
@@ -46,7 +46,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 libnuma1 curl \
|
||||
&& apt-get install -y libgomp1 libnuma1 curl ffmpeg \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
|
||||
+1
-1
@@ -12,7 +12,7 @@ SYCL:
|
||||
- ggml/src/ggml-sycl/**
|
||||
- docs/backend/SYCL.md
|
||||
- examples/sycl/**
|
||||
Nvidia GPU:
|
||||
CUDA:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-cuda.h
|
||||
|
||||
+103
-124
@@ -34,129 +34,108 @@ env:
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-sycl:
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# ubuntu-24-sycl:
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [fp32]
|
||||
# include:
|
||||
# - build: fp32
|
||||
# fp16: OFF
|
||||
#
|
||||
# runs-on: ubuntu-24.04
|
||||
#
|
||||
# env:
|
||||
# ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# LEVEL_ZERO_VERSION: "1.28.2"
|
||||
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
#
|
||||
# continue-on-error: true
|
||||
#
|
||||
# steps:
|
||||
# - uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: bash
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
#
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: sycl-ubuntu-24-${{ matrix.build }}
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# cmake -B build \
|
||||
# -G "Ninja" \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_SYCL=ON \
|
||||
# -DCMAKE_C_COMPILER=icx \
|
||||
# -DCMAKE_CXX_COMPILER=icpx \
|
||||
# -DLLAMA_OPENSSL=OFF \
|
||||
# -DGGML_NATIVE=OFF \
|
||||
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
# time cmake --build build --config Release -j $(nproc)
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# windows-latest-sycl:
|
||||
# runs-on: windows-2022
|
||||
#
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash
|
||||
#
|
||||
# env:
|
||||
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: pwsh
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: sycl-windows-latest
|
||||
# variant: ccache
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: examples/sycl/win-build-sycl.bat
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: sycl-ubuntu-24-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
windows-latest-sycl:
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: sycl-windows-latest
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
@@ -35,6 +35,29 @@ env:
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
format:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Install clang-format 22
|
||||
run: |
|
||||
wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key |
|
||||
sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc > /dev/null
|
||||
sudo add-apt-repository -y \
|
||||
"deb http://apt.llvm.org/noble/ llvm-toolchain-noble-22 main"
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y clang-format-22
|
||||
|
||||
- name: Check formatting
|
||||
run: |
|
||||
find ggml/src/ggml-webgpu \
|
||||
-type f \( -name '*.cpp' -o -name '*.hpp' -o -name '*.h' \) \
|
||||
-print0 |
|
||||
xargs -0 clang-format-22 --dry-run --Werror
|
||||
|
||||
macos:
|
||||
runs-on: macos-latest
|
||||
|
||||
|
||||
@@ -82,8 +82,8 @@ jobs:
|
||||
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
|
||||
|
||||
+254
-223
@@ -59,8 +59,31 @@ jobs:
|
||||
echo "should_release=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
get-version:
|
||||
runs-on: ubuntu-slim
|
||||
outputs:
|
||||
ui_version: ${{ steps.version.outputs.ui_version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- id: version
|
||||
run: |
|
||||
# Resolve UI version: BUILD_NUMBER from cmake/build-info.cmake > git hash + epoch > fallback
|
||||
version=""
|
||||
if grep -q "BUILD_NUMBER" cmake/build-info.cmake; then
|
||||
build_number=$(grep "set(BUILD_NUMBER" cmake/build-info.cmake | grep -oP '\d+')
|
||||
if [ -n "$build_number" ] && [ "$build_number" -gt 0 ]; then
|
||||
version="b${build_number}"
|
||||
fi
|
||||
fi
|
||||
if [ -z "$version" ]; then
|
||||
version=$(git rev-parse --short HEAD)-$(date +%s)
|
||||
fi
|
||||
echo "ui_version=${version}" >> $GITHUB_OUTPUT
|
||||
|
||||
macos-cpu:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -116,6 +139,7 @@ jobs:
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
@@ -141,7 +165,7 @@ jobs:
|
||||
name: llama-bin-macos-${{ matrix.build }}.tar.gz
|
||||
|
||||
ubuntu-cpu:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -201,6 +225,7 @@ jobs:
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
@@ -227,7 +252,7 @@ jobs:
|
||||
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
|
||||
|
||||
ubuntu-vulkan:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
strategy:
|
||||
@@ -287,6 +312,7 @@ jobs:
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DGGML_VULKAN=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
@@ -312,7 +338,7 @@ jobs:
|
||||
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
|
||||
android-arm64:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
@@ -379,6 +405,7 @@ jobs:
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
@@ -404,7 +431,7 @@ jobs:
|
||||
name: llama-bin-android-arm64.tar.gz
|
||||
|
||||
ubuntu-24-openvino:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
@@ -476,7 +503,8 @@ jobs:
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENVINO=ON
|
||||
-DGGML_OPENVINO=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
|
||||
cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||
|
||||
- name: ccache-clear
|
||||
@@ -504,7 +532,7 @@ jobs:
|
||||
needs: [check-release]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2025
|
||||
runs-on: windows-2025-vs2026
|
||||
|
||||
permissions:
|
||||
actions: write
|
||||
@@ -535,12 +563,12 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: release-windows-2025-${{ matrix.arch }}-cpu
|
||||
key: release-windows-2025-vs2026-${{ matrix.arch }}-cpu
|
||||
|
||||
- name: Build
|
||||
shell: cmd
|
||||
run: |
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
|
||||
call "C:\Program Files\Microsoft Visual Studio\18\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
@@ -554,12 +582,12 @@ jobs:
|
||||
- name: ccache-clear
|
||||
uses: ./.github/actions/ccache-clear
|
||||
with:
|
||||
key: release-windows-2025-${{ matrix.arch }}-cpu
|
||||
key: release-windows-2025-vs2026-${{ matrix.arch }}-cpu
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
|
||||
Copy-Item "C:\Program Files\Microsoft Visual Studio\18\Enterprise\VC\Redist\MSVC\14.51.36231\debug_nonredist\${{ matrix.arch }}\Microsoft.VC145.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
|
||||
7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
@@ -754,213 +782,209 @@ jobs:
|
||||
path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# windows-sycl:
|
||||
#
|
||||
# runs-on: windows-2022
|
||||
#
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash
|
||||
#
|
||||
# env:
|
||||
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: pwsh
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
#
|
||||
# - name: Setup Node.js
|
||||
# uses: actions/setup-node@v6
|
||||
# with:
|
||||
# node-version: "24"
|
||||
# cache: "npm"
|
||||
# cache-dependency-path: "tools/ui/package-lock.json"
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: release-windows-2022-x64-sycl
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# shell: cmd
|
||||
# run: |
|
||||
# call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
# cmake -G "Ninja" -B build ^
|
||||
# -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
|
||||
# -DCMAKE_BUILD_TYPE=Release ^
|
||||
# -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
# -DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
# -DLLAMA_BUILD_BORINGSSL=ON
|
||||
# cmake --build build --target ggml-sycl -j
|
||||
#
|
||||
# - name: Build the release package
|
||||
# id: pack_artifacts
|
||||
# run: |
|
||||
# echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
# ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
|
||||
# if [ -n "$ZE_LOADER_DLL" ]; then
|
||||
# echo "Using Level Zero loader: $ZE_LOADER_DLL"
|
||||
# cp "$ZE_LOADER_DLL" ./build/bin
|
||||
# else
|
||||
# echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
|
||||
# fi
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
|
||||
#
|
||||
# echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
# 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
#
|
||||
# - name: Upload the release package
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-bin-win-sycl-x64.zip
|
||||
# name: llama-bin-win-sycl-x64.zip
|
||||
windows-sycl:
|
||||
needs: [check-release]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# ubuntu-24-sycl:
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [fp32]
|
||||
# include:
|
||||
# - build: fp32
|
||||
# fp16: OFF
|
||||
#
|
||||
# runs-on: ubuntu-24.04
|
||||
#
|
||||
# env:
|
||||
# ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# LEVEL_ZERO_VERSION: "1.28.2"
|
||||
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: bash
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
#
|
||||
# - name: Setup Node.js
|
||||
# uses: actions/setup-node@v6
|
||||
# with:
|
||||
# node-version: "24"
|
||||
# cache: "npm"
|
||||
# cache-dependency-path: "tools/ui/package-lock.json"
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: release-ubuntu-24.04-sycl
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# cmake -B build \
|
||||
# -G "Ninja" \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_SYCL=ON \
|
||||
# -DCMAKE_C_COMPILER=icx \
|
||||
# -DCMAKE_CXX_COMPILER=icpx \
|
||||
# -DLLAMA_OPENSSL=OFF \
|
||||
# -DGGML_NATIVE=OFF \
|
||||
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
# time cmake --build build --config Release -j $(nproc)
|
||||
#
|
||||
# - name: Determine tag name
|
||||
# id: tag
|
||||
# uses: ./.github/actions/get-tag-name
|
||||
#
|
||||
# - name: Pack artifacts
|
||||
# id: pack_artifacts
|
||||
# run: |
|
||||
# cp LICENSE ./build/bin/
|
||||
# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
#
|
||||
# - name: Upload artifacts
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
# name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: release-windows-2022-x64-sycl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
run: |
|
||||
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
cmake -G "Ninja" -B build ^
|
||||
-DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --target ggml-sycl -j %NUMBER_OF_PROCESSORS%
|
||||
|
||||
- name: ccache-clear
|
||||
uses: ./.github/actions/ccache-clear
|
||||
with:
|
||||
key: release-windows-2022-x64-sycl
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
|
||||
if [ -n "$ZE_LOADER_DLL" ]; then
|
||||
echo "Using Level Zero loader: $ZE_LOADER_DLL"
|
||||
cp "$ZE_LOADER_DLL" ./build/bin
|
||||
else
|
||||
echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
|
||||
fi
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
ubuntu-24-sycl:
|
||||
needs: [check-release]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: release-ubuntu-24.04-sycl-${{ matrix.build }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: ccache-clear
|
||||
uses: ./.github/actions/ccache-clear
|
||||
with:
|
||||
key: release-ubuntu-24.04-sycl-${{ matrix.build }}
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
|
||||
ubuntu-22-rocm:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
@@ -1052,6 +1076,7 @@ jobs:
|
||||
-DGGML_HIP=ON \
|
||||
-DHIP_PLATFORM=amd \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
@@ -1080,7 +1105,7 @@ jobs:
|
||||
name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
|
||||
|
||||
windows-hip:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2022
|
||||
@@ -1176,6 +1201,7 @@ jobs:
|
||||
-DGPU_TARGETS="${{ matrix.gpu_targets }}" `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_HIP=ON `
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} `
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
|
||||
md "build\bin\rocblas\library\"
|
||||
@@ -1203,7 +1229,7 @@ jobs:
|
||||
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
|
||||
|
||||
ios-xcode:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
runs-on: macos-26
|
||||
|
||||
@@ -1232,7 +1258,8 @@ jobs:
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=16.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
@@ -1352,10 +1379,12 @@ jobs:
|
||||
# path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
# name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
|
||||
ui:
|
||||
needs: [check-release]
|
||||
ui-build:
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
with:
|
||||
hf_ui_version: ${{ needs.get-version.outputs.ui_version }}
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
@@ -1368,6 +1397,7 @@ jobs:
|
||||
runs-on: ubuntu-slim
|
||||
|
||||
needs:
|
||||
- get-version
|
||||
- windows
|
||||
- windows-cpu
|
||||
- windows-cuda
|
||||
@@ -1382,7 +1412,7 @@ jobs:
|
||||
- macos-cpu
|
||||
- ios-xcode
|
||||
#- openEuler-cann
|
||||
- ui
|
||||
- ui-build
|
||||
|
||||
outputs:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
@@ -1482,7 +1512,8 @@ jobs:
|
||||
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
|
||||
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
|
||||
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
|
||||
- Ubuntu x64 (SYCL FP32) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
|
||||
- [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
|
||||
|
||||
**Android:**
|
||||
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
|
||||
@@ -1493,7 +1524,7 @@ jobs:
|
||||
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
|
||||
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
|
||||
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
|
||||
- Windows x64 (SYCL) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
|
||||
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
|
||||
|
||||
**openEuler:**
|
||||
|
||||
@@ -28,13 +28,6 @@ jobs:
|
||||
run: npm run build
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Generate checksums
|
||||
run: |
|
||||
cd tools/ui/dist
|
||||
for f in *; do
|
||||
sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
|
||||
done
|
||||
|
||||
- name: Upload built UI
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
|
||||
@@ -2,6 +2,11 @@ name: UI Build
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
hf_ui_version:
|
||||
description: 'Version string for version.json (e.g. 12345)'
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
build:
|
||||
@@ -25,15 +30,15 @@ jobs:
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build application
|
||||
env:
|
||||
HF_UI_VERSION: ${{ inputs.hf_ui_version || '' }}
|
||||
LLAMA_BUILD_NUMBER: ${{ inputs.hf_ui_version || 'b0000' }}
|
||||
run: npm run build
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Generate checksums
|
||||
run: |
|
||||
cd tools/ui/dist
|
||||
for f in *; do
|
||||
sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
|
||||
done
|
||||
- name: Run PWA unit tests (versioned build output)
|
||||
run: npx vitest --project=unit --run tests/unit/pwa.spec.ts
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Upload built UI
|
||||
uses: actions/upload-artifact@v6
|
||||
|
||||
@@ -40,6 +40,12 @@ jobs:
|
||||
name: ui-build
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Create distribution archive
|
||||
run: |
|
||||
tar -czf dist.tar.gz -C tools/ui/dist .
|
||||
sha256sum dist.tar.gz > dist.tar.gz.sha256
|
||||
mv dist.tar.gz dist.tar.gz.sha256 tools/ui/dist/
|
||||
|
||||
- name: Install Hugging Face Hub CLI
|
||||
run: pip install -U huggingface_hub
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
name: UI (self-hosted)
|
||||
|
||||
# these are the same as ui.yml, but with self-hosted runners
|
||||
# the runners come with pre-installed Playwright browsers version: 1.56.1
|
||||
# the jobs are much lighter because they don't need to install node and playwright browsers
|
||||
# the jobs are lighter because they don't need to install Node.js or Playwright browsers
|
||||
# the runner has pre-installed Playwright browsers for @playwright/test (1.56.1) at /ms-playwright/
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@@ -61,6 +61,12 @@ jobs:
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Download built UI artifacts
|
||||
uses: actions/download-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Run type checking
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run check
|
||||
@@ -72,12 +78,12 @@ jobs:
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Client tests
|
||||
if: ${{ always() }}
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run test:client
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Unit tests
|
||||
if: ${{ always() }}
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run test:unit
|
||||
working-directory: tools/ui
|
||||
|
||||
@@ -97,22 +103,23 @@ jobs:
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build application
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run build
|
||||
working-directory: tools/ui
|
||||
- name: Download built UI artifacts
|
||||
uses: actions/download-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Build Storybook
|
||||
if: ${{ always() }}
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run build-storybook
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run UI tests
|
||||
if: ${{ always() }}
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run test:ui -- --testTimeout=60000
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run E2E tests
|
||||
if: ${{ always() }}
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run test:e2e
|
||||
working-directory: tools/ui
|
||||
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
ui-checks:
|
||||
name: Checks
|
||||
needs: ui-build
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-24.04
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout code
|
||||
@@ -60,6 +60,12 @@ jobs:
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: Download built UI artifacts
|
||||
uses: actions/download-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Install dependencies
|
||||
id: setup
|
||||
if: ${{ steps.node.conclusion == 'success' }}
|
||||
@@ -87,7 +93,7 @@ jobs:
|
||||
run: npm run test:client
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Unit tests
|
||||
- name: Run Unit tests (uses pre-built dist/ from ui-build)
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run test:unit
|
||||
working-directory: tools/ui
|
||||
@@ -95,7 +101,7 @@ jobs:
|
||||
e2e-tests:
|
||||
name: E2E Tests
|
||||
needs: ui-build
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
@@ -117,10 +123,11 @@ jobs:
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build application
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run build
|
||||
working-directory: tools/ui
|
||||
- name: Download built UI artifacts (reuses ui-build)
|
||||
uses: actions/download-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Install Playwright browsers
|
||||
id: playwright
|
||||
@@ -138,7 +145,7 @@ jobs:
|
||||
run: npm run test:ui -- --testTimeout=60000
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run E2E tests
|
||||
- name: Run E2E tests (uses pre-built dist/ from ui-build)
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run test:e2e
|
||||
working-directory: tools/ui
|
||||
|
||||
@@ -17,7 +17,7 @@ jobs:
|
||||
|
||||
- name: Install komac
|
||||
run: |
|
||||
cargo binstall komac@2.15.0 -y
|
||||
cargo binstall komac@2.16.0 -y
|
||||
|
||||
- name: Find latest release
|
||||
id: find_latest_release
|
||||
|
||||
@@ -92,13 +92,6 @@
|
||||
!/examples/sycl/*.bat
|
||||
!/examples/sycl/*.sh
|
||||
|
||||
# Server Web UI temporary files (+ legacy directory)
|
||||
|
||||
/tools/server/webui/node_modules
|
||||
/tools/server/webui/dist
|
||||
/tools/ui/node_modules
|
||||
/tools/ui/dist
|
||||
|
||||
# Python
|
||||
|
||||
/.venv
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# llama.cpp
|
||||
|
||||

|
||||

|
||||
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://github.com/ggml-org/llama.cpp/releases)
|
||||
|
||||
+19
-5
@@ -444,7 +444,7 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
|
||||
opts.offline = params.offline;
|
||||
opts.skip_download = params.skip_download;
|
||||
opts.download_mtp = spec_type_draft_mtp;
|
||||
opts.download_mmproj = !params.no_mmproj;
|
||||
opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty();
|
||||
|
||||
// sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
|
||||
// so we should not auto-discover mtp/mmproj siblings for them
|
||||
@@ -1360,7 +1360,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
add_opt(common_arg(
|
||||
{"--cache-idle-slots"},
|
||||
{"--no-cache-idle-slots"},
|
||||
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
|
||||
"save idle slots to the prompt cache on new task, and clear them when using unified KV (default: enabled, requires cache-ram)",
|
||||
[](common_params & params, bool value) {
|
||||
params.cache_idle_slots = value;
|
||||
}
|
||||
@@ -1615,7 +1615,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
const auto sampler_names = string_split<std::string>(value, ';');
|
||||
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
|
||||
params.sampling.samplers = common_sampler_types_from_names(sampler_names);
|
||||
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
|
||||
}
|
||||
).set_sampling());
|
||||
@@ -2221,8 +2221,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
|
||||
add_opt(common_arg(
|
||||
{"--image", "--audio"}, "FILE",
|
||||
"path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
|
||||
{"--image", "--audio", "--video"}, "FILE",
|
||||
"path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files\n",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (const auto & item : parse_csv_row(value)) {
|
||||
params.image.emplace_back(item);
|
||||
@@ -2243,6 +2243,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.image_max_tokens = value;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
|
||||
add_opt(common_arg(
|
||||
{"--mtmd-batch-max-tokens"}, "N",
|
||||
string_format("maximum number of image tokens per batch when encoding images (default: %d)", params.mtmd_batch_max_tokens),
|
||||
[](common_params & params, int value) {
|
||||
params.mtmd_batch_max_tokens = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MTMD_BATCH_MAX_TOKENS"));
|
||||
if (llama_supports_rpc()) {
|
||||
add_opt(common_arg(
|
||||
{"--rpc"}, "SERVERS",
|
||||
@@ -3333,6 +3340,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
common_log_set_file(common_log_main(), value.c_str());
|
||||
}
|
||||
).set_env("LLAMA_ARG_LOG_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"--log-prompts-dir"}, "PATH",
|
||||
"Log prompts to directory (only used for debugging, default: disabled)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.path_prompts_log_dir = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
add_opt(common_arg(
|
||||
{"--log-colors"}, "[on|off|auto]",
|
||||
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
|
||||
|
||||
@@ -87,6 +87,8 @@ static std::string normalize_quotes_to_json(const std::string & input) {
|
||||
bool in_single_quoted = false;
|
||||
bool in_double_quoted = false;
|
||||
|
||||
auto is_word_char = [](char ch) { return std::isalnum(static_cast<unsigned char>(ch)) || ch == '_'; };
|
||||
|
||||
for (size_t i = 0; i < input.size(); ++i) {
|
||||
char c = input[i];
|
||||
|
||||
@@ -151,6 +153,29 @@ static std::string normalize_quotes_to_json(const std::string & input) {
|
||||
in_single_quoted = true;
|
||||
result += '"';
|
||||
}
|
||||
} else if (!in_single_quoted && !in_double_quoted && (c == 'T' || c == 'F' || c == 'N') &&
|
||||
(i == 0 || !is_word_char(input[i - 1]))) {
|
||||
// Python literals -> JSON; prefix match keeps streamed partials monotonic.
|
||||
static constexpr std::pair<std::string_view, std::string_view> literals[] = {
|
||||
{ "True", "true" }, { "False", "false" }, { "None", "null" },
|
||||
};
|
||||
size_t n = 0;
|
||||
while (i + n < input.size() && is_word_char(input[i + n])) {
|
||||
++n;
|
||||
}
|
||||
std::string_view token(input.data() + i, n);
|
||||
bool matched = false;
|
||||
for (const auto & [py, js] : literals) {
|
||||
if (py.substr(0, n) == token) {
|
||||
result += js.substr(0, n);
|
||||
i += n - 1;
|
||||
matched = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!matched) {
|
||||
result += c;
|
||||
}
|
||||
} else {
|
||||
result += c;
|
||||
}
|
||||
@@ -353,12 +378,8 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
|
||||
}
|
||||
value_to_add += escape_json_string_inner(value_content);
|
||||
} else if (!value_content.empty()) {
|
||||
// For potential containers, normalize Python-style single quotes to JSON double quotes
|
||||
bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
|
||||
if (is_potential_container) {
|
||||
value_content = normalize_container_value(value_content);
|
||||
}
|
||||
value_to_add += value_content;
|
||||
// Pythonic scalars/containers -> JSON.
|
||||
value_to_add += normalize_container_value(value_content);
|
||||
}
|
||||
|
||||
args_target() += value_to_add;
|
||||
@@ -466,11 +487,34 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
|
||||
return force_tool_calls ? section : optional(section);
|
||||
}
|
||||
|
||||
// Like python_value(), but the leaf also accepts JSON-cased true/false/null, used by LFM2/LFM2.5
|
||||
common_peg_parser common_chat_peg_builder::python_or_json_value() {
|
||||
return rule("python-or-json-value", [this]() {
|
||||
auto ws = space();
|
||||
auto value = python_or_json_value();
|
||||
|
||||
auto member = sequence({ python_string(), ws, literal(":"), ws, value });
|
||||
auto members = sequence({ member, zero_or_more(sequence({ ws, literal(","), ws, member })) });
|
||||
auto dict = rule("python-or-json-dict", [&]() {
|
||||
return sequence({ literal("{"), ws, choice({ literal("}"), sequence({ members, ws, literal("}") }) }), ws });
|
||||
});
|
||||
|
||||
auto elements = sequence({ value, zero_or_more(sequence({ literal(","), ws, value })) });
|
||||
auto array = rule("python-or-json-array", [&]() {
|
||||
return sequence({ literal("["), ws, choice({ literal("]"), sequence({ elements, ws, literal("]") }) }), ws });
|
||||
});
|
||||
|
||||
return choice({ dict, array, python_string(), python_number(),
|
||||
python_bool(), python_null(), json_bool(), json_null() });
|
||||
});
|
||||
}
|
||||
|
||||
// Python-style tool calls: name(arg1="value1", arg2=123)
|
||||
// Used only by LFM2 for now, so we don't merge it into autoparser
|
||||
common_peg_parser common_chat_peg_builder::python_style_tool_calls(
|
||||
const ordered_json & tools,
|
||||
bool parallel_tool_calls) {
|
||||
bool parallel_tool_calls,
|
||||
bool allow_json_literals) {
|
||||
if (!tools.is_array() || tools.empty()) {
|
||||
return eps();
|
||||
}
|
||||
@@ -504,7 +548,7 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
|
||||
if (is_string_type) {
|
||||
arg_value_parser = string_value_parser;
|
||||
} else {
|
||||
arg_value_parser = tool_arg_value(python_value());
|
||||
arg_value_parser = tool_arg_value(allow_json_literals ? python_or_json_value() : python_value());
|
||||
}
|
||||
|
||||
// Full argument: name="value" or name=value
|
||||
|
||||
@@ -132,9 +132,13 @@ class common_chat_peg_builder : public common_peg_parser_builder {
|
||||
// Helper for Python-style function call format: name(arg1="value1", arg2=123)
|
||||
// Used by LFM2 and similar templates
|
||||
common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
|
||||
bool parallel_tool_calls);
|
||||
bool parallel_tool_calls,
|
||||
bool allow_json_literals);
|
||||
|
||||
private:
|
||||
// Python values plus JSON true/false/null.
|
||||
common_peg_parser python_or_json_value();
|
||||
|
||||
// Implementation helpers for standard_json_tools — one per JSON tool call layout mode
|
||||
common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
|
||||
const std::string & args_key,
|
||||
@@ -195,4 +199,3 @@ struct tagged_peg_parser {
|
||||
|
||||
tagged_peg_parser build_tagged_peg_parser(
|
||||
const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
|
||||
|
||||
|
||||
+48
-117
@@ -1608,42 +1608,52 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
|
||||
return data;
|
||||
}
|
||||
|
||||
// LFM2 format: uses <|tool_list_start|>[...]<|tool_list_end|> in system prompt
|
||||
// and <|tool_call_start|>[name(arg="val")]<|tool_call_end|> for tool calls.
|
||||
// - Reasoning: <think>{reasoning}</think> (optional)
|
||||
// - Content: text before a tool call (optional)
|
||||
// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
|
||||
// Tool calls can appear multiple times (parallel tool calls supported)
|
||||
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs) {
|
||||
// LFM2/LFM2.5 parser. Tool calls are almost Python-style and parallel-capable
|
||||
// (except dotted names and JSON literals true/false/null).
|
||||
// Always wrapped in <|tool_call_start|>[name(args)]<|tool_call_end|> with optional <think> reasoning.
|
||||
// tool_list_tokens preserves LFM2 system tool-list markers.
|
||||
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs,
|
||||
bool tool_list_tokens) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.preserved_tokens = {
|
||||
"<|tool_list_start|>",
|
||||
"<|tool_list_end|>",
|
||||
"<|tool_call_start|>",
|
||||
"<|tool_call_end|>",
|
||||
"<think>",
|
||||
"</think>",
|
||||
};
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
|
||||
const std::string TOOL_CALL_START = "<|tool_call_start|>";
|
||||
const std::string TOOL_CALL_END = "<|tool_call_end|>";
|
||||
const std::string TOOL_LIST_START = "<|tool_list_start|>";
|
||||
const std::string TOOL_LIST_END = "<|tool_list_end|>";
|
||||
const std::string THINK_START = "<think>";
|
||||
const std::string THINK_END = "</think>";
|
||||
const std::string GEN_PROMPT = "<|im_start|>assistant\n";
|
||||
|
||||
// Copy reasoning to the "thinking" field the template expects
|
||||
auto adjusted_messages = json::array();
|
||||
for (auto msg : inputs.messages) {
|
||||
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
|
||||
msg["thinking"] = msg.at("reasoning_content");
|
||||
}
|
||||
adjusted_messages.push_back(msg);
|
||||
}
|
||||
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs, adjusted_messages);
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, adjusted_messages);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.preserved_tokens = { TOOL_CALL_START, TOOL_CALL_END, THINK_START, THINK_END };
|
||||
if (tool_list_tokens) {
|
||||
data.preserved_tokens.push_back(TOOL_LIST_START);
|
||||
data.preserved_tokens.push_back(TOOL_LIST_END);
|
||||
}
|
||||
|
||||
data.thinking_start_tag = THINK_START;
|
||||
data.thinking_end_tag = THINK_END;
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
|
||||
// Gate by reasoning format and whether the template supports <think>
|
||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE &&
|
||||
tmpl.source().find(THINK_START) != std::string::npos;
|
||||
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
|
||||
|
||||
if (inputs.has_continuation()) {
|
||||
const auto & msg = inputs.continue_msg;
|
||||
|
||||
@@ -1660,17 +1670,21 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
|
||||
auto end = p.end();
|
||||
|
||||
auto reasoning = p.eps();
|
||||
if (extract_reasoning && inputs.enable_thinking) {
|
||||
if (extract_reasoning) {
|
||||
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
|
||||
}
|
||||
|
||||
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
if (has_response_format) {
|
||||
auto response_format = p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema));
|
||||
return generation_prompt + reasoning + response_format + end;
|
||||
}
|
||||
return generation_prompt + reasoning + p.content(p.rest()) + end;
|
||||
}
|
||||
auto tool_calls = p.rule("tool-calls",
|
||||
p.trigger_rule("tool-call",
|
||||
p.literal(TOOL_CALL_START) +
|
||||
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
|
||||
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls, /* allow_json_literals = */ true) +
|
||||
p.literal(TOOL_CALL_END)
|
||||
)
|
||||
);
|
||||
@@ -1683,13 +1697,17 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
|
||||
data.parser = parser.save();
|
||||
|
||||
if (include_grammar) {
|
||||
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
auto schema = function.at("parameters");
|
||||
builder.resolve_refs(schema);
|
||||
});
|
||||
if (has_response_format) {
|
||||
auto schema = inputs.json_schema;
|
||||
builder.resolve_refs(schema);
|
||||
}
|
||||
parser.build_grammar(builder, data.grammar_lazy);
|
||||
});
|
||||
|
||||
@@ -1697,93 +1715,6 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
|
||||
};
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
// LFM2.5 format: uses plain "List of tools: [...]" in system prompt, no wrapper tokens.
|
||||
// Tool calls are bare [name(arg="val")], though model may optionally emit <|tool_call_start|>.
|
||||
// - Reasoning: <think>{reasoning}</think> (optional)
|
||||
// - Content: text before a tool call (optional)
|
||||
// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
|
||||
// Tool calls can appear multiple times (parallel tool calls supported)
|
||||
static common_chat_params common_chat_params_init_lfm2_5(const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.preserved_tokens = {
|
||||
"<|tool_call_start|>",
|
||||
"<|tool_call_end|>",
|
||||
"<think>",
|
||||
"</think>",
|
||||
};
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
|
||||
const std::string THINK_START = "<think>";
|
||||
const std::string THINK_END = "</think>";
|
||||
const std::string GEN_PROMPT = "<|im_start|>assistant\n";
|
||||
|
||||
data.thinking_start_tag = THINK_START;
|
||||
data.thinking_end_tag = THINK_END;
|
||||
|
||||
if (inputs.has_continuation()) {
|
||||
const auto & msg = inputs.continue_msg;
|
||||
|
||||
data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
|
||||
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
|
||||
data.generation_prompt += THINK_END + msg.render_content();
|
||||
}
|
||||
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
auto generation_prompt = p.literal(GEN_PROMPT);
|
||||
auto end = p.end();
|
||||
|
||||
auto reasoning = p.eps();
|
||||
if (extract_reasoning && inputs.enable_thinking) {
|
||||
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
|
||||
}
|
||||
|
||||
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
return generation_prompt + reasoning + p.content(p.rest()) + end;
|
||||
}
|
||||
|
||||
auto tool_calls = p.rule("tool-calls",
|
||||
p.trigger_rule("tool-call",
|
||||
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls)
|
||||
)
|
||||
);
|
||||
|
||||
auto content = p.content(p.until_one_of({"<|tool_call_start|>", "["}));
|
||||
auto maybe_start = p.optional(p.literal("<|tool_call_start|>"));
|
||||
return generation_prompt + reasoning + content + maybe_start + tool_calls + end;
|
||||
});
|
||||
|
||||
data.parser = parser.save();
|
||||
|
||||
if (include_grammar) {
|
||||
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
auto schema = function.at("parameters");
|
||||
builder.resolve_refs(schema);
|
||||
});
|
||||
parser.build_grammar(builder, data.grammar_lazy);
|
||||
});
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const std::string name = tool.at("function").at("name");
|
||||
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[" + name + "(" });
|
||||
});
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
@@ -2298,14 +2229,14 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
|
||||
if (is_lfm2_template(src)) {
|
||||
LOG_DBG("Using specialized template: LFM2\n");
|
||||
return common_chat_params_init_lfm2(tmpl, params);
|
||||
return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ true);
|
||||
}
|
||||
|
||||
// LFM2.5 format detection: template uses plain "List of tools: [...]" with no special tokens
|
||||
if (src.find("List of tools: [") != std::string::npos &&
|
||||
src.find("<|tool_list_start|>") == std::string::npos) {
|
||||
LOG_DBG("Using specialized template: LFM2.5\n");
|
||||
return common_chat_params_init_lfm2_5(tmpl, params);
|
||||
return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ false);
|
||||
}
|
||||
|
||||
// GigaChatV3 format detection
|
||||
|
||||
+1
-1
@@ -1148,7 +1148,7 @@ static void common_init_sampler_from_model(
|
||||
if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
|
||||
const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
|
||||
if (!sampler_names.empty()) {
|
||||
sparams.samplers = common_sampler_types_from_names(sampler_names, true);
|
||||
sparams.samplers = common_sampler_types_from_names(sampler_names);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+3
-1
@@ -489,6 +489,7 @@ struct common_params {
|
||||
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
|
||||
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
|
||||
std::string logits_file = ""; // file for saving *all* logits // NOLINT
|
||||
std::string path_prompts_log_dir = ""; // directory with logged prompts // NOLINT
|
||||
|
||||
// llama-debug specific options
|
||||
std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
|
||||
@@ -571,9 +572,10 @@ struct common_params {
|
||||
struct common_params_model mmproj;
|
||||
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
||||
bool no_mmproj = false; // explicitly disable multimodal model
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
|
||||
int image_min_tokens = -1;
|
||||
int image_max_tokens = -1;
|
||||
int mtmd_batch_max_tokens = 1024;
|
||||
|
||||
// finetune
|
||||
struct lr_opt lr;
|
||||
|
||||
+29
-6
@@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error {
|
||||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
static std::vector<llama_device_memory_data> common_get_device_memory_data_impl(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
@@ -150,6 +150,29 @@ std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
return ret;
|
||||
}
|
||||
|
||||
common_device_memory_data_vec common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
ggml_log_level log_level) {
|
||||
std::vector<llama_device_memory_data> impl = common_get_device_memory_data_impl(
|
||||
path_model, mparams, cparams, devs, hp_ngl, hp_n_ctx_train, hp_n_expert, log_level);
|
||||
|
||||
common_device_memory_data_vec ret(impl.size());
|
||||
for (size_t i = 0; i < impl.size(); i++) {
|
||||
ret[i].total = impl[i].total;
|
||||
ret[i].free = impl[i].free;
|
||||
ret[i].model = impl[i].mb.model;
|
||||
ret[i].context = impl[i].mb.context;
|
||||
ret[i].compute = impl[i].mb.compute;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void common_params_fit_impl(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
@@ -169,7 +192,7 @@ static void common_params_fit_impl(
|
||||
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
||||
|
||||
LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const dmds_t dmds_full = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const size_t nd = devs.size(); // number of devices
|
||||
|
||||
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
@@ -304,7 +327,7 @@ static void common_params_fit_impl(
|
||||
|
||||
int64_t sum_projected_used_min_ctx = 0;
|
||||
cparams->n_ctx = n_ctx_min;
|
||||
const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const dmds_t dmds_min_ctx = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
if (nd == 0) {
|
||||
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
|
||||
} else {
|
||||
@@ -482,7 +505,7 @@ static void common_params_fit_impl(
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
|
||||
|
||||
const dmds_t dmd_nl = common_get_device_memory_data(
|
||||
const dmds_t dmd_nl = common_get_device_memory_data_impl(
|
||||
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
LOG_TRC("%s: memory for test allocation by device:\n", func_name);
|
||||
@@ -510,7 +533,7 @@ static void common_params_fit_impl(
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
|
||||
LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
|
||||
const dmds_t dmds_cpu_moe = common_get_device_memory_data(
|
||||
const dmds_t dmds_cpu_moe = common_get_device_memory_data_impl(
|
||||
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
@@ -940,7 +963,7 @@ void common_fit_print(
|
||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||
uint32_t hp_nex = 0; // hparams.n_expert
|
||||
|
||||
auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
|
||||
auto dmd = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
|
||||
GGML_ASSERT(dmd.size() == devs.size() + 1);
|
||||
|
||||
for (size_t id = 0; id < devs.size(); id++) {
|
||||
|
||||
+32
-24
@@ -1,9 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "llama.h"
|
||||
#include "../src/llama-ext.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
@@ -18,31 +16,41 @@ enum common_params_fit_status {
|
||||
// - this function is NOT thread safe because it modifies the global llama logger state
|
||||
// - only parameters that have the same value as in llama_default_model_params are modified
|
||||
// with the exception of the context size which is modified if and only if equal to 0
|
||||
enum common_params_fit_status common_fit_params(
|
||||
const char * path_model,
|
||||
struct llama_model_params * mparams,
|
||||
struct llama_context_params * cparams,
|
||||
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
|
||||
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
|
||||
size_t * margins, // margins of memory to leave per device in bytes
|
||||
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
|
||||
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
|
||||
common_params_fit_status common_fit_params(
|
||||
const char * path_model,
|
||||
llama_model_params * mparams,
|
||||
llama_context_params * cparams,
|
||||
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
|
||||
llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
|
||||
size_t * margins, // margins of memory to leave per device in bytes
|
||||
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
|
||||
ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
|
||||
|
||||
// print estimated memory to stdout
|
||||
void common_fit_print(
|
||||
const char * path_model,
|
||||
struct llama_model_params * mparams,
|
||||
struct llama_context_params * cparams);
|
||||
const char * path_model,
|
||||
llama_model_params * mparams,
|
||||
llama_context_params * cparams);
|
||||
|
||||
void common_memory_breakdown_print(const struct llama_context * ctx);
|
||||
void common_memory_breakdown_print(const llama_context * ctx);
|
||||
|
||||
struct common_device_memory_data {
|
||||
int64_t total;
|
||||
int64_t free;
|
||||
size_t model;
|
||||
size_t context;
|
||||
size_t compute;
|
||||
};
|
||||
|
||||
using common_device_memory_data_vec = std::vector<common_device_memory_data>;
|
||||
|
||||
// Load a model + context with no_alloc and return the per-device memory breakdown.
|
||||
std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const struct llama_model_params * mparams,
|
||||
const struct llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
enum ggml_log_level log_level);
|
||||
common_device_memory_data_vec common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
ggml_log_level log_level);
|
||||
|
||||
@@ -761,9 +761,9 @@ value member_expression::execute_impl(context & ctx) {
|
||||
|
||||
if (is_stmt<slice_expression>(this->property)) {
|
||||
auto s = cast_stmt<slice_expression>(this->property);
|
||||
value start_val = s->start_expr ? s->start_expr->execute(ctx) : mk_val<value_int>(0);
|
||||
value stop_val = s->stop_expr ? s->stop_expr->execute(ctx) : mk_val<value_int>(arr_size);
|
||||
value step_val = s->step_expr ? s->step_expr->execute(ctx) : mk_val<value_int>(1);
|
||||
value start_val = s->start_expr ? s->start_expr->execute(ctx) : (step_val->as_int() < 0 ? mk_val<value_int>(arr_size - 1) : mk_val<value_int>(0));
|
||||
value stop_val = s->stop_expr ? s->stop_expr->execute(ctx) : (step_val->as_int() < 0 ? mk_val<value_int>(-1) : mk_val<value_int>(arr_size));
|
||||
|
||||
// translate to function call: obj.slice(start, stop, step)
|
||||
JJ_DEBUG("Member expression is a slice: start %s, stop %s, step %s",
|
||||
|
||||
+26
-7
@@ -90,14 +90,14 @@ static T slice(const T & array, int64_t start, int64_t stop, int64_t step = 1) {
|
||||
stop_val = std::min(stop_val, len);
|
||||
}
|
||||
} else {
|
||||
start_val = len - 1;
|
||||
start_val = start;
|
||||
if (start_val < 0) {
|
||||
start_val = std::max(len + start_val, (int64_t)-1);
|
||||
start_val = std::max(len + start_val, (int64_t)0);
|
||||
} else {
|
||||
start_val = std::min(start_val, len - 1);
|
||||
}
|
||||
|
||||
stop_val = -1;
|
||||
stop_val = stop;
|
||||
if (stop_val < -1) {
|
||||
stop_val = std::max(len + stop_val, (int64_t)-1);
|
||||
} else {
|
||||
@@ -673,6 +673,9 @@ const func_builtins & value_string_t::get_builtins() const {
|
||||
std::string str = val_input->as_string().str();
|
||||
// FIXME: Support non-specified delimiter (split on consecutive (no leading or trailing) whitespace)
|
||||
std::string delim = (args.count() > 1) ? args.get_pos(1)->as_string().str() : " ";
|
||||
if (delim.empty()) {
|
||||
throw raised_exception("empty separator");
|
||||
}
|
||||
int64_t maxsplit = (args.count() > 2) ? args.get_pos(2)->as_int() : -1;
|
||||
auto result = mk_val<value_array>();
|
||||
size_t pos = 0;
|
||||
@@ -697,6 +700,9 @@ const func_builtins & value_string_t::get_builtins() const {
|
||||
std::string str = val_input->as_string().str();
|
||||
// FIXME: Support non-specified delimiter (split on consecutive (no leading or trailing) whitespace)
|
||||
std::string delim = (args.count() > 1) ? args.get_pos(1)->as_string().str() : " ";
|
||||
if (delim.empty()) {
|
||||
throw raised_exception("empty separator");
|
||||
}
|
||||
int64_t maxsplit = (args.count() > 2) ? args.get_pos(2)->as_int() : -1;
|
||||
auto result = mk_val<value_array>();
|
||||
size_t pos = 0;
|
||||
@@ -722,10 +728,23 @@ const func_builtins & value_string_t::get_builtins() const {
|
||||
if (count > 0) {
|
||||
throw not_implemented_exception("String replace with count argument not implemented");
|
||||
}
|
||||
size_t pos = 0;
|
||||
while ((pos = str.find(old_str, pos)) != std::string::npos) {
|
||||
str.replace(pos, old_str.length(), new_str);
|
||||
pos += new_str.length();
|
||||
if (old_str != new_str) {
|
||||
size_t pos = 0;
|
||||
if (old_str.empty()) {
|
||||
std::string new_res;
|
||||
new_res.reserve(str.length() + new_str.length() * (str.length() + 1));
|
||||
new_res += new_str;
|
||||
for (const char c : str) {
|
||||
new_res.push_back(c);
|
||||
new_res += new_str;
|
||||
}
|
||||
str = new_res;
|
||||
} else {
|
||||
while ((pos = str.find(old_str, pos)) != std::string::npos) {
|
||||
str.replace(pos, old_str.length(), new_str);
|
||||
pos += new_str.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
auto res = mk_val<value_string>(str);
|
||||
res->val_str.mark_input_based_on(args.get_pos(0)->val_str);
|
||||
|
||||
+49
-40
@@ -769,54 +769,63 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
||||
std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
|
||||
{ "dry", COMMON_SAMPLER_TYPE_DRY },
|
||||
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
|
||||
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
|
||||
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
|
||||
{ "adaptive_p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
|
||||
};
|
||||
|
||||
// since samplers names are written multiple ways
|
||||
// make it ready for both system names and input names
|
||||
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
|
||||
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ "adaptive-p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
|
||||
};
|
||||
std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names) {
|
||||
// sampler names can be written multiple ways; generate aliases from canonical names
|
||||
static const auto sampler_name_map = []{
|
||||
// canonical sampler name mapping
|
||||
std::unordered_map<std::string, common_sampler_type> canonical_name_map {
|
||||
{ "dry", COMMON_SAMPLER_TYPE_DRY },
|
||||
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
|
||||
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
|
||||
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
|
||||
{ "adaptive_p", COMMON_SAMPLER_TYPE_ADAPTIVE_P }
|
||||
};
|
||||
std::unordered_map<std::string, common_sampler_type> alias_name_map;
|
||||
for (const auto & entry : canonical_name_map) {
|
||||
const std::string & canonical = entry.first;
|
||||
if (canonical.find('_') == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
// kebab-case: "top-k", "min-p", etc.
|
||||
{
|
||||
std::string kebab_case = canonical;
|
||||
std::replace(kebab_case.begin(), kebab_case.end(), '_', '-');
|
||||
alias_name_map.insert({kebab_case, entry.second});
|
||||
}
|
||||
// no dash: "topk", "minp", etc.
|
||||
{
|
||||
std::string no_dash = canonical;
|
||||
no_dash.erase(std::remove(no_dash.begin(), no_dash.end(), '_'), no_dash.end());
|
||||
alias_name_map.insert({no_dash, entry.second});
|
||||
}
|
||||
}
|
||||
// misc. aliases
|
||||
alias_name_map.insert({"nucleus", COMMON_SAMPLER_TYPE_TOP_P});
|
||||
alias_name_map.insert({"temp", COMMON_SAMPLER_TYPE_TEMPERATURE});
|
||||
alias_name_map.insert({"typ", COMMON_SAMPLER_TYPE_TYPICAL_P});
|
||||
// include aliases + canonical names in the complete mapping
|
||||
alias_name_map.merge(canonical_name_map);
|
||||
return alias_name_map;
|
||||
}();
|
||||
|
||||
std::vector<common_sampler_type> samplers;
|
||||
samplers.reserve(names.size());
|
||||
|
||||
for (const auto & name : names) {
|
||||
auto sampler = sampler_canonical_name_map.find(name);
|
||||
if (sampler != sampler_canonical_name_map.end()) {
|
||||
std::string name_lower = name;
|
||||
std::transform(name_lower.begin(), name_lower.end(), name_lower.begin(), ::tolower);
|
||||
auto sampler = sampler_name_map.find(name_lower);
|
||||
if (sampler != sampler_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
continue;
|
||||
}
|
||||
if (allow_alt_names) {
|
||||
sampler = sampler_alt_name_map.find(name);
|
||||
if (sampler != sampler_alt_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
|
||||
LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name_lower.c_str());
|
||||
}
|
||||
|
||||
return samplers;
|
||||
|
||||
+1
-1
@@ -109,7 +109,7 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx,
|
||||
char common_sampler_type_to_chr(enum common_sampler_type cnstr);
|
||||
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
|
||||
|
||||
std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
||||
std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names);
|
||||
std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
|
||||
|
||||
llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
|
||||
|
||||
+473
-55
@@ -3,13 +3,14 @@
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP)
|
||||
#include "log.h"
|
||||
#include "ngram-cache.h"
|
||||
#include "ngram-map.h"
|
||||
#include "ngram-mod.h"
|
||||
#include "sampling.h"
|
||||
|
||||
#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP)
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
@@ -58,10 +59,10 @@ static bool common_speculative_are_compatible(
|
||||
const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
|
||||
const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
|
||||
|
||||
const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
|
||||
const auto vocab_type_tgt = llama_vocab_type(vocab_tgt);
|
||||
LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
|
||||
|
||||
const bool vocab_type_dft = llama_vocab_type(vocab_dft);
|
||||
const auto vocab_type_dft = llama_vocab_type(vocab_dft);
|
||||
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
|
||||
|
||||
if (vocab_type_tgt != vocab_type_dft) {
|
||||
@@ -374,31 +375,437 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// EAGLE3 speculative decoding state
|
||||
//
|
||||
// Input of draft decoder: (This is different compared to MTP)
|
||||
// At "pos P", the decoder takes input pair (t_{P+1}, g_P), with RoPE at P.
|
||||
// - t_{P+1} = token at sequence pos P+1 (the *next* token after P)
|
||||
// - g_P = encoder output = projection of target's extracted hidden states at P
|
||||
//
|
||||
// Deferred boundary (MTP doesn't have this issue):
|
||||
// Within a single process() call with n_tokens, we can only write decoder KV for
|
||||
// training pos 0..n_tokens-2. The last training pos (n_tokens-1) needs t_{n_tokens}
|
||||
// which lies *outside* this batch — it is the token target will sample next or the first token from next ubatch.
|
||||
// So the last training pos of each process() call is *deferred* to whichever next call has
|
||||
// the missing token in hand:
|
||||
// - multi-ubatch prefill: the next process()'s first token completes the pair
|
||||
// (handled by the per-seq "cross-ubatch bridge")
|
||||
// - single-ubatch prefill / after verify: draft()'s seed step uses "dp.id_last"
|
||||
// (target's freshest sample) to complete the pair
|
||||
//
|
||||
// Per-seq carry-over state:
|
||||
// pending_g_last [n_embd_dec] ┐ the deferred boundary's (g, pos). Set by
|
||||
// pending_pos_last llama_pos ┘ process() at end of ubatch (= last row);
|
||||
// rebased by accept() to first-non-accepted pos.
|
||||
// verify_g [N × n_embd_dec] snapshot of process()'s encoder output;
|
||||
// verify_pos_first llama_pos consumed by accept() to recover the right
|
||||
// verify_g_rows int32_t pending_g_last row for any n_accepted value.
|
||||
//
|
||||
// Performance is overall good but there is waste in verify cycle:
|
||||
// process() runs encoder + decoder on the *full* verify batch including rows for
|
||||
// rejected drafts. The KV at those positions is then dropped.
|
||||
//
|
||||
// TODO: Not sure if we need optimization for this waste?
|
||||
// If so we may need hybrid stash:
|
||||
// in verify mode, have process() only stash features and let draft() seed run
|
||||
// encoder+decoder on n_accepted+1 rows).
|
||||
struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
|
||||
//common_params_speculative_eagle3 params;
|
||||
common_params_speculative_draft params;
|
||||
llama_batch batch;
|
||||
|
||||
std::vector<common_sampler_ptr> smpls;
|
||||
|
||||
int32_t n_embd_dec = 0; // draft hidden size
|
||||
int32_t n_embd_enc = 0; // target_layer_ids_n * target_hidden_size
|
||||
int32_t n_embd_tgt = 0; // target model hidden size
|
||||
|
||||
const int32_t * target_layer_ids = nullptr; // model_dft's extract layer indices
|
||||
uint32_t target_layer_ids_n = 0;
|
||||
|
||||
// [per-seq] deferred boundary state
|
||||
std::vector<std::vector<float>> pending_g_last;
|
||||
std::vector<llama_pos> pending_pos_last;
|
||||
|
||||
// [per-seq] snapshot of the most recent process()'s encoder output
|
||||
std::vector<std::vector<float>> verify_g; // [n_seq][n_rows * n_embd_dec]
|
||||
std::vector<llama_pos> verify_pos_first; // [n_seq] — pos of verify_g[seq][0]
|
||||
std::vector<int32_t> verify_g_rows; // [n_seq] — number of rows
|
||||
|
||||
// scratch buffer for concatenated target features [n_tokens, n_embd_enc]
|
||||
std::vector<float> features_buf;
|
||||
std::vector<float> g_embd_buf;
|
||||
|
||||
common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq)
|
||||
, params(params.draft)
|
||||
{
|
||||
LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
|
||||
|
||||
auto * ctx_tgt = this->params.ctx_tgt;
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
GGML_ASSERT(ctx_tgt && ctx_dft && "EAGLE3 requires ctx_tgt and ctx_dft to be set");
|
||||
|
||||
const llama_model * model_dft = llama_get_model(ctx_dft);
|
||||
const llama_model * model_tgt = llama_get_model(ctx_tgt);
|
||||
|
||||
target_layer_ids = llama_model_target_layer_ids (model_dft);
|
||||
target_layer_ids_n = llama_model_target_layer_ids_n(model_dft);
|
||||
if (target_layer_ids_n != 3) {
|
||||
throw std::runtime_error("draft model is not eagle3 (expected 3 extract layers, got " +
|
||||
std::to_string(target_layer_ids_n) + ")");
|
||||
}
|
||||
|
||||
n_embd_tgt = llama_model_n_embd(model_tgt);
|
||||
n_embd_dec = llama_model_n_embd(model_dft);
|
||||
n_embd_enc = (int32_t) target_layer_ids_n * n_embd_tgt;
|
||||
|
||||
const int32_t n_b = (int32_t) llama_n_batch(ctx_dft);
|
||||
batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd_dec, /*n_seq_max=*/ 1);
|
||||
// llama_batch_init allocates only one of token/embd; eagle3 decoder needs both.
|
||||
// TODO: fix, how to call without malloc
|
||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_b);
|
||||
|
||||
smpls.resize(n_seq);
|
||||
for (auto & s : smpls) {
|
||||
common_params_sampling sparams;
|
||||
sparams.no_perf = false;
|
||||
sparams.top_k = 10;
|
||||
sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K };
|
||||
s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
|
||||
}
|
||||
|
||||
// turn on extraction of the target layers' input embeddings
|
||||
for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
|
||||
llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], true);
|
||||
}
|
||||
|
||||
// turn on extraction of the draft model's pre-norm hidden state
|
||||
// (used both for the encoder output g_embd and the decoder pre-norm output).
|
||||
llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);
|
||||
|
||||
pending_g_last.assign(n_seq, std::vector<float>(n_embd_dec, 0.0f));
|
||||
pending_pos_last.assign(n_seq, -1);
|
||||
|
||||
verify_g.assign(n_seq, std::vector<float>());
|
||||
verify_pos_first.assign(n_seq, -1);
|
||||
verify_g_rows.assign(n_seq, 0);
|
||||
}
|
||||
|
||||
void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
|
||||
// noop
|
||||
~common_speculative_impl_draft_eagle3() override {
|
||||
if (batch.token != nullptr) {
|
||||
free(batch.token);
|
||||
batch.token = nullptr;
|
||||
}
|
||||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
bool process(const llama_batch & /*batch*/) override {
|
||||
// TODO: implement
|
||||
void begin(llama_seq_id seq_id, const llama_tokens & prompt) override {
|
||||
const int32_t N = (int32_t) prompt.size();
|
||||
if (N <= 0) {
|
||||
return;
|
||||
}
|
||||
// expected state after prefill: ctx_dft has pos 0..N-2 (last position is deferred to
|
||||
// draft()'s seed step). Warn only if more than one position is missing.
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
|
||||
if (pos_max < N - 2) {
|
||||
LOG_WRN("%s: ctx_dft pos_max=%d < N-2=%d — process() did not run on every prefill ubatch. "
|
||||
"Drafts may degrade.\n",
|
||||
__func__, (int) pos_max, N - 2);
|
||||
}
|
||||
}
|
||||
|
||||
bool process(const llama_batch & batch_in) override {
|
||||
if (batch_in.n_tokens <= 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (batch_in.token == nullptr || batch_in.embd != nullptr) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const int32_t n_tokens = batch_in.n_tokens;
|
||||
|
||||
// i_batch_beg[seq] / i_batch_end[seq]: inclusive batch indices of this seq's
|
||||
// first/last token in batch_in. Assumes per-seq tokens are contiguous within
|
||||
// the ubatch (server's default ordering).
|
||||
std::vector<int32_t> i_batch_beg(n_seq, -1);
|
||||
std::vector<int32_t> i_batch_end(n_seq, -1);
|
||||
for (int k = 0; k < n_tokens; ++k) {
|
||||
GGML_ASSERT(batch_in.n_seq_id[k] == 1);
|
||||
const llama_seq_id seq_id = batch_in.seq_id[k][0];
|
||||
if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
|
||||
continue;
|
||||
}
|
||||
i_batch_end[seq_id] = k;
|
||||
if (i_batch_beg[seq_id] < 0) {
|
||||
i_batch_beg[seq_id] = k;
|
||||
}
|
||||
}
|
||||
|
||||
auto * ctx_tgt = this->params.ctx_tgt;
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
|
||||
// Interleave each extract_layer's hidden state into a contiguous buffer of
|
||||
// shape [n_tokens, target_layer_ids_n * n_embd_tgt]. Then run EAGLE3 encoder
|
||||
// to get one g_embd row per token.
|
||||
features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f);
|
||||
|
||||
for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
|
||||
const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k]);
|
||||
if (!layer) {
|
||||
GGML_ABORT("EAGLE3: target layer %d input not extracted.", target_layer_ids[k]);
|
||||
}
|
||||
for (int32_t i = 0; i < n_tokens; ++i) {
|
||||
float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) n_embd_tgt;
|
||||
const float * src = layer + (size_t) i * n_embd_tgt;
|
||||
std::memcpy(dst, src, (size_t) n_embd_tgt * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
g_embd_buf.resize((size_t) n_tokens * n_embd_dec);
|
||||
|
||||
// llama_encode() requires the full encoder batch to fit in n_ubatch.
|
||||
// Allow batch > ubatch: eagle3's per-token encoder can be chunked safely.
|
||||
const int32_t n_ubatch_dft = (int32_t) llama_n_ubatch(ctx_dft);
|
||||
for (int32_t i = 0; i < n_tokens; i += n_ubatch_dft) {
|
||||
const int32_t n_chunk = std::min(n_ubatch_dft, n_tokens - i);
|
||||
|
||||
llama_batch enc_batch = {
|
||||
/*.n_tokens =*/ n_chunk,
|
||||
/*.token =*/ nullptr,
|
||||
/*.embd =*/ features_buf.data() + (size_t) i * n_embd_enc,
|
||||
/*.pos =*/ nullptr,
|
||||
/*.n_seq_id =*/ nullptr,
|
||||
/*.seq_id =*/ nullptr,
|
||||
/*.logits =*/ nullptr,
|
||||
};
|
||||
const int32_t rc = llama_encode(ctx_dft, enc_batch);
|
||||
if (rc != 0) {
|
||||
LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n",
|
||||
__func__, rc, (int) n_chunk, (int) i);
|
||||
return false;
|
||||
}
|
||||
|
||||
// g_embd has shape [n_chunk, n_embd_dec] in ctx_dft's pre-norm embeddings buffer.
|
||||
const float * g_embd_chunk = llama_get_embeddings_nextn(ctx_dft);
|
||||
GGML_ASSERT(g_embd_chunk && "EAGLE3 encoder produced no output.");
|
||||
std::memcpy(g_embd_buf.data() + (size_t) i * n_embd_dec,
|
||||
g_embd_chunk,
|
||||
(size_t) n_chunk * n_embd_dec * sizeof(float));
|
||||
}
|
||||
|
||||
const float * g_embd = g_embd_buf.data();
|
||||
|
||||
const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
|
||||
|
||||
// EAGLE3 decoder input convention: at memory pos P the input pair is
|
||||
// (token[P+1], g_embd[P]). This shifts the token index "left by one" relative to g_embd.
|
||||
//
|
||||
// Per seq, in order:
|
||||
// (a) cross-ubatch bridge — when applicable, write the previously-deferred
|
||||
// pos using this ubatch's first token + pending_g_last.
|
||||
// (b) main write loop — for k in [beg, end-1], write (token[k+1], g_embd[k])
|
||||
// at pos[k]. The last training pos (k=end) is left unwritten = new
|
||||
// deferred boundary, completed by the next process() or draft() call.
|
||||
// (c) refresh deferred state — stash this ubatch's full g_embd into verify_g,
|
||||
// update pending_g_last / pending_pos_last to the last row.
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
const int32_t beg = i_batch_beg[seq_id];
|
||||
const int32_t end = i_batch_end[seq_id];
|
||||
if (beg < 0 || end < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// cross-ubatch bridge — complete the prior ubatch's deferred boundary.
|
||||
// Fires iff all three preconditions hold:
|
||||
// 1) pending_pos_last >= 0
|
||||
// 2) pending_pos_last + 1 == pos[beg]
|
||||
// 3) pending_pos_last > dft_pos_max // TODO: is this check needed?
|
||||
const llama_pos pending_pos = pending_pos_last[seq_id];
|
||||
if (pending_pos >= 0 && pending_pos + 1 == batch_in.pos[beg]) {
|
||||
const llama_pos dft_pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
|
||||
if (pending_pos > dft_pos_max) {
|
||||
common_batch_add(batch, batch_in.token[beg], pending_pos, { seq_id }, /*logits=*/ false);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
|
||||
pending_g_last[seq_id].data(), row_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t k = beg; k < end; ++k) {
|
||||
common_batch_add(batch, batch_in.token[k + 1], batch_in.pos[k], { seq_id }, /*logits=*/ false);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
|
||||
g_embd + (size_t) k * n_embd_dec, row_bytes);
|
||||
}
|
||||
|
||||
// refresh deferred state
|
||||
const int32_t n_rows = end - beg + 1;
|
||||
verify_pos_first[seq_id] = batch_in.pos[beg];
|
||||
pending_pos_last[seq_id] = batch_in.pos[end];
|
||||
verify_g_rows[seq_id] = n_rows;
|
||||
verify_g[seq_id].resize((size_t) n_rows * n_embd_dec, 0.0f);
|
||||
std::memcpy(verify_g[seq_id].data(), g_embd + (size_t) beg * n_embd_dec, row_bytes * n_rows);
|
||||
std::memcpy(pending_g_last[seq_id].data(), g_embd + (size_t) end * n_embd_dec, row_bytes);
|
||||
}
|
||||
|
||||
if (batch.n_tokens > 0) {
|
||||
const int32_t rc = llama_decode(ctx_dft, batch);
|
||||
if (rc != 0) {
|
||||
LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n",
|
||||
__func__, rc, (int) batch.n_tokens, (int) batch_in.pos[0]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void draft(common_speculative_draft_params_vec & /*dparams*/) override {
|
||||
// TODO: implement
|
||||
void draft(common_speculative_draft_params_vec & dparams) override {
|
||||
auto & ctx_dft = params.ctx_dft;
|
||||
|
||||
common_batch_clear(batch);
|
||||
|
||||
// keep track of which sequences are still drafting
|
||||
int n_drafting = 0;
|
||||
std::vector<bool> drafting(n_seq);
|
||||
|
||||
const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
|
||||
|
||||
// Complete the deferred boundary pair (dp.id_last, pending_g_last) at memory
|
||||
// pos pending_pos_last. dp.id_last is target's freshest sample (= corrected
|
||||
// token after verify, or first generated token after prefill), matching the
|
||||
// EAGLE3 input convention (token[P+1], g_embd[P]) at pos P.
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
auto & dp = dparams[seq_id];
|
||||
|
||||
if (!dp.drafting) {
|
||||
continue;
|
||||
}
|
||||
if (pending_pos_last[seq_id] < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
n_drafting++;
|
||||
drafting[seq_id] = true;
|
||||
common_sampler_reset(smpls[seq_id].get());
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_dft), seq_id, pending_pos_last[seq_id], -1);
|
||||
|
||||
common_batch_add(batch, dp.id_last, pending_pos_last[seq_id], { seq_id }, true);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
|
||||
pending_g_last[seq_id].data(),
|
||||
row_bytes);
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0) {
|
||||
LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
|
||||
return;
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
|
||||
while (n_drafting > 0) {
|
||||
int i_batch = 0;
|
||||
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
if (!drafting[seq_id]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto * smpl = smpls[seq_id].get();
|
||||
|
||||
common_sampler_sample(smpl, ctx_dft, i_batch, true);
|
||||
// pre-norm hidden state of this position becomes g_embd for the next step
|
||||
const float * prenorm = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
|
||||
++i_batch;
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(smpl, true);
|
||||
|
||||
for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
|
||||
LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||
seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p,
|
||||
common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
|
||||
}
|
||||
|
||||
const llama_token id = cur_p->data[0].id;
|
||||
|
||||
// only collect very high-confidence draft tokens
|
||||
// (configurable via --spec-draft-p-min, set to 0.0 to disable early-stop)
|
||||
if (cur_p->data[0].p < params.p_min) {
|
||||
drafting[seq_id] = false;
|
||||
n_drafting--;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
auto & dp = dparams.at(seq_id);
|
||||
auto & result = *dp.result;
|
||||
|
||||
result.push_back(id);
|
||||
|
||||
if (params.n_max <= (int) result.size()) {
|
||||
drafting[seq_id] = false;
|
||||
n_drafting--;
|
||||
continue;
|
||||
}
|
||||
|
||||
common_batch_add(batch, id, pending_pos_last[seq_id] + (i + 1), { seq_id }, true);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, prenorm, row_bytes);
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0) {
|
||||
LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
|
||||
break;
|
||||
}
|
||||
|
||||
++i;
|
||||
}
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
auto & dp = dparams[seq_id];
|
||||
if (!dp.drafting) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (dp.result->size() < (size_t) params.n_min) {
|
||||
dp.result->clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
|
||||
// noop
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override {
|
||||
if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int32_t n_rows = verify_g_rows[seq_id];
|
||||
if (n_rows <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int32_t i_g = std::min<int32_t>(n_accepted, n_rows - 1);
|
||||
pending_pos_last[seq_id] = verify_pos_first[seq_id] + i_g;
|
||||
std::memcpy(pending_g_last[seq_id].data(),
|
||||
verify_g[seq_id].data() + (size_t) i_g * n_embd_dec,
|
||||
(size_t) n_embd_dec * sizeof(float));
|
||||
}
|
||||
|
||||
bool need_embd() const override {
|
||||
@@ -418,6 +825,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
|
||||
int32_t n_embd = 0;
|
||||
|
||||
bool is_mem_shared = false;
|
||||
|
||||
// Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
|
||||
// The last h-row of one process() call needs the first token of the NEXT
|
||||
// call to pair with, so it's stashed here until that next call fires.
|
||||
@@ -444,7 +853,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
GGML_ASSERT(ctx_tgt && ctx_dft && "MTP requires ctx_tgt and ctx_dft to be set");
|
||||
|
||||
n_embd = llama_model_n_embd(llama_get_model(ctx_dft));
|
||||
n_embd = llama_model_n_embd_out(llama_get_model(ctx_dft));
|
||||
GGML_ASSERT(n_embd == llama_model_n_embd(llama_get_model(ctx_tgt)) &&
|
||||
"MTP input row width must match the target h_nextn width");
|
||||
|
||||
LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
|
||||
@@ -490,6 +901,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false);
|
||||
llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);
|
||||
|
||||
is_mem_shared = llama_get_ctx_other(ctx_dft) == ctx_tgt;
|
||||
|
||||
pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f));
|
||||
|
||||
i_batch_beg.assign(n_seq, -1);
|
||||
@@ -526,9 +939,11 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
if (N <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
|
||||
if (pos_max < N - 1) {
|
||||
|
||||
if (pos_max < N - 1 && !is_mem_shared) {
|
||||
LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - "
|
||||
"process() hook may not have run on every prefill ubatch "
|
||||
"(need_embd / logits=1 on every prompt position?). "
|
||||
@@ -571,48 +986,42 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
|
||||
const size_t row_bytes = (size_t) n_embd * sizeof(float);
|
||||
|
||||
common_batch_clear(batch);
|
||||
// if kv is shared with target (e.g Gemma4), then we can skip this catch-up decode
|
||||
if (!is_mem_shared) {
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (int k = 0; k < n_tokens; ++k) {
|
||||
common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0);
|
||||
}
|
||||
|
||||
// shift the tgt embeddings to the right by one position
|
||||
// assumes that the tokens in the batch are sequential for each sequence
|
||||
// i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1]
|
||||
// ^--- this is a problem
|
||||
// TODO:this is generally true, but would be nice to assert it
|
||||
{
|
||||
const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt);
|
||||
std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1));
|
||||
|
||||
//{
|
||||
// // string with seq_ids in the batch
|
||||
// std::stringstream ss;
|
||||
// for (int i = 0; i < n_tokens; ++i) {
|
||||
// ss << batch_in.seq_id[i][0] << ",";
|
||||
// }
|
||||
// LOG_WRN("%s: batch_in.seq_id = %s\n", __func__, ss.str().c_str());
|
||||
//}
|
||||
}
|
||||
|
||||
// fill the pending embeddings from a previous run
|
||||
auto set_h = [&](int idx, const float * h_row) {
|
||||
std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes);
|
||||
};
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
if (i_batch_beg[seq_id] < 0) {
|
||||
continue;
|
||||
for (int k = 0; k < n_tokens; ++k) {
|
||||
common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0);
|
||||
}
|
||||
|
||||
set_h(i_batch_beg[seq_id], pending_h[seq_id].data());
|
||||
}
|
||||
// shift the tgt embeddings to the right by one position
|
||||
// assumes that the tokens in the batch are sequential for each sequence
|
||||
// i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1]
|
||||
// ^--- this is a problem
|
||||
// TODO:this is generally true, but would be nice to assert it
|
||||
{
|
||||
const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt);
|
||||
std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1));
|
||||
}
|
||||
|
||||
const int32_t rc = llama_decode(ctx_dft, batch);
|
||||
if (rc != 0) {
|
||||
LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]);
|
||||
return false;
|
||||
// fill the pending embeddings from a previous run
|
||||
auto set_h = [&](int idx, const float * h_row) {
|
||||
std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes);
|
||||
};
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
if (i_batch_beg[seq_id] < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
set_h(i_batch_beg[seq_id], pending_h[seq_id].data());
|
||||
}
|
||||
|
||||
const int32_t rc = llama_decode(ctx_dft, batch);
|
||||
if (rc != 0) {
|
||||
LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
@@ -721,7 +1130,13 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
continue;
|
||||
}
|
||||
|
||||
common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true);
|
||||
if (is_mem_shared) {
|
||||
// note: with shared memory (e.g. Gemma4 assistants) we use the same position for all draft tokens
|
||||
// ref: https://github.com/huggingface/transformers/blob/effde20942e3f82a1b97449f60b3a48c5ff96145/docs/source/en/model_doc/gemma4_assistant.md?plain=1#L36-L37
|
||||
common_batch_add(batch, id, dp.n_past, { seq_id }, true);
|
||||
} else {
|
||||
common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true);
|
||||
}
|
||||
std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
|
||||
}
|
||||
|
||||
@@ -834,7 +1249,8 @@ struct common_speculative_impl_ngram_map_k : public common_speculative_impl {
|
||||
common_speculative_impl_ngram_map_k(
|
||||
const common_ngram_map & config,
|
||||
uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, n_seq)
|
||||
: common_speculative_impl(config.key_only ? COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K
|
||||
: COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, n_seq)
|
||||
{
|
||||
for (uint32_t i = 0; i < n_seq; i++) {
|
||||
this->config.push_back(config);
|
||||
@@ -1360,9 +1776,11 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types);
|
||||
|
||||
bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
|
||||
bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
|
||||
bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr;
|
||||
bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;
|
||||
|
||||
|
||||
|
||||
bool has_ngram_cache = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_CACHE));
|
||||
bool has_ngram_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE));
|
||||
bool has_ngram_map_k = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K));
|
||||
|
||||
@@ -40,6 +40,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"ChatGLMModel": "chatglm",
|
||||
"CodeShellForCausalLM": "codeshell",
|
||||
"CogVLMForCausalLM": "cogvlm",
|
||||
"Cohere2MoeForCausalLM": "command_r",
|
||||
"Cohere2ForCausalLM": "command_r",
|
||||
"CohereForCausalLM": "command_r",
|
||||
"DbrxForCausalLM": "dbrx",
|
||||
@@ -75,9 +76,11 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"Gemma3TextModel": "gemma",
|
||||
"Gemma3nForCausalLM": "gemma",
|
||||
"Gemma3nForConditionalGeneration": "gemma",
|
||||
"Gemma4AssistantForCausalLM": "gemma",
|
||||
"Gemma4ForConditionalGeneration": "gemma",
|
||||
"Gemma4ForCausalLM": "gemma",
|
||||
"Gemma4UnifiedForConditionalGeneration": "gemma",
|
||||
"Gemma4UnifiedAssistantForCausalLM": "gemma",
|
||||
"GemmaForCausalLM": "gemma",
|
||||
"Glm4ForCausalLM": "glm",
|
||||
"Glm4MoeForCausalLM": "glm",
|
||||
@@ -128,6 +131,9 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"LlamaBidirectionalModel": "llama",
|
||||
"LlamaForCausalLM": "llama",
|
||||
"LlamaModel": "llama",
|
||||
"Eagle3DraftModel": "llama",
|
||||
"Eagle3Speculator": "llama",
|
||||
"LlamaForCausalLMEagle3": "llama",
|
||||
"LlavaForConditionalGeneration": "llama",
|
||||
"LlavaStableLMEpochForCausalLM": "stablelm",
|
||||
"MPTForCausalLM": "mpt",
|
||||
@@ -253,6 +259,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
|
||||
"Glm4vMoeForConditionalGeneration": "qwen3vl",
|
||||
"GlmOcrForConditionalGeneration": "qwen3vl",
|
||||
"GlmasrModel": "ultravox",
|
||||
"Granite4VisionForConditionalGeneration": "granite",
|
||||
"GraniteSpeechForConditionalGeneration": "granite",
|
||||
"HunYuanVLForConditionalGeneration": "hunyuan",
|
||||
"Idefics3ForConditionalGeneration": "smolvlm",
|
||||
|
||||
+9
-2
@@ -94,6 +94,7 @@ class ModelBase:
|
||||
metadata: gguf.Metadata
|
||||
dir_model_card: Path
|
||||
remote_hf_model_id: str | None
|
||||
target_model_dir: Path | None
|
||||
|
||||
# subclasses should define this!
|
||||
model_arch: gguf.MODEL_ARCH
|
||||
@@ -119,6 +120,7 @@ class ModelBase:
|
||||
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
|
||||
disable_mistral_community_chat_template: bool = False,
|
||||
sentence_transformers_dense_modules: bool = False,
|
||||
target_model_dir: Path | None = None,
|
||||
fuse_gate_up_exps: bool = False,
|
||||
fp8_as_q8: bool = False):
|
||||
if type(self) is ModelBase or \
|
||||
@@ -139,6 +141,7 @@ class ModelBase:
|
||||
self.dry_run = dry_run
|
||||
self.remote_hf_model_id = remote_hf_model_id
|
||||
self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
|
||||
self.target_model_dir = target_model_dir
|
||||
self.fuse_gate_up_exps = fuse_gate_up_exps
|
||||
self._gate_exp_buffer: dict[int, Tensor] = {}
|
||||
self._up_exp_buffer: dict[int, Tensor] = {}
|
||||
@@ -1192,7 +1195,7 @@ class TextModel(ModelBase):
|
||||
self.gguf_writer.add_embedding_length(n_embd)
|
||||
logger.info(f"gguf: embedding length = {n_embd}")
|
||||
|
||||
if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
|
||||
if (n_ff := self.find_hparam(["prefix_dense_intermediate_size", "intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
|
||||
self.gguf_writer.add_feed_forward_length(n_ff)
|
||||
logger.info(f"gguf: feed forward length = {n_ff}")
|
||||
|
||||
@@ -1277,7 +1280,7 @@ class TextModel(ModelBase):
|
||||
self.gguf_writer.add_expert_group_used_count(n_group_used)
|
||||
logger.info(f"gguf: expert groups used count = {n_group_used}")
|
||||
|
||||
if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None:
|
||||
if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func", "expert_selection_fn"], optional=True)) is not None:
|
||||
if score_func == "sigmoid":
|
||||
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
|
||||
elif score_func == "softmax":
|
||||
@@ -1492,6 +1495,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1":
|
||||
# ref: https://huggingface.co/CohereLabs/tiny-aya-base
|
||||
res = "tiny_aya"
|
||||
if chkhsh == "52df12b4c8d4176e7481aab4b6e8454d1fd0a210a04a574f6d4e067d10e23c3e":
|
||||
# ref: https://huggingface.co/CohereLabs/North-Mini-Code-1.0
|
||||
res = "cohere2moe"
|
||||
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
|
||||
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
|
||||
res = "qwen2"
|
||||
@@ -2481,6 +2487,7 @@ class LazyTorchTensor(gguf.LazyBase):
|
||||
torch.float16: np.float16,
|
||||
torch.float32: np.float32,
|
||||
torch.uint8: np.uint8,
|
||||
torch.int64: np.int64,
|
||||
}
|
||||
|
||||
# only used when byteswapping data. Only correct size is needed
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
@@ -55,3 +56,122 @@ class Cohere2Model(TextModel):
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Cohere2MoeForCausalLM")
|
||||
class Cohere2MoeModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.COHERE2MOE
|
||||
_n_main_layers: int | None = None
|
||||
_expert_tensor_re = re.compile(
|
||||
r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.(down_proj|gate_proj|up_proj)\.weight"
|
||||
)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
if (n_nextn := int(self.hparams.get("num_nextn_predict_layers", 0) or 0)) > 0 and not self.no_mtp:
|
||||
self.block_count += n_nextn
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
self._experts: list[dict[str, Tensor]] = [{} for _ in range(self.block_count)]
|
||||
|
||||
def _set_vocab_gpt2(self) -> None:
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
expert_intermediate_size = hparams["intermediate_size"]
|
||||
mlp_layer_types = hparams.get("mlp_layer_types")
|
||||
n_dense_lead = hparams.get("first_k_dense_replace", 0)
|
||||
if mlp_layer_types is not None:
|
||||
n_dense_lead = next((i for i, t in enumerate(mlp_layer_types) if t != "dense"), len(mlp_layer_types))
|
||||
|
||||
super().set_gguf_parameters()
|
||||
|
||||
self.gguf_writer.add_logit_scale(hparams["logit_scale"])
|
||||
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
||||
self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
|
||||
self.gguf_writer.add_leading_dense_block_count(n_dense_lead)
|
||||
self.gguf_writer.add_expert_weights_norm(hparams.get("norm_topk_prob", False))
|
||||
if (num_shared_experts := hparams.get("num_shared_experts", 0)) > 0:
|
||||
if hparams.get("shared_expert_combination_strategy", "average") != "average":
|
||||
raise ValueError("Cohere2 MoE only supports average shared expert combination")
|
||||
self.gguf_writer.add_expert_shared_count(num_shared_experts)
|
||||
self.gguf_writer.add_expert_shared_feed_forward_length(expert_intermediate_size * num_shared_experts)
|
||||
if (n_nextn := hparams.get("num_nextn_predict_layers", 0)) > 0 and not self.no_mtp:
|
||||
self.gguf_writer.add_nextn_predict_layers(n_nextn)
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["head_dim"])
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None):
|
||||
hparams = {**self.hparams, **self.hparams.get("text_config", {})}
|
||||
self._n_main_layers = hparams.get("num_hidden_layers")
|
||||
type(self)._n_main_layers = self._n_main_layers
|
||||
return super().index_tensors(remote_hf_model_id=remote_hf_model_id)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item):
|
||||
if (titem := super().filter_tensors(item)) is None:
|
||||
return None
|
||||
name, gen = titem
|
||||
|
||||
if cls._n_main_layers is not None:
|
||||
is_mtp = (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None and int(m.group(1)) >= cls._n_main_layers
|
||||
if is_mtp and cls.no_mtp:
|
||||
return None
|
||||
if cls.mtp_only and not is_mtp and name not in (
|
||||
"model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
|
||||
):
|
||||
return None
|
||||
|
||||
return name, gen
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.endswith(".bias"):
|
||||
if torch.any(data_torch != 0):
|
||||
raise ValueError(f"Bias tensor {name!r} is not zero.")
|
||||
logger.debug(f"Skipping bias tensor {name!r}.")
|
||||
return
|
||||
|
||||
if (m := self._expert_tensor_re.fullmatch(name)) is not None:
|
||||
n_experts = self.hparams["num_experts"]
|
||||
layer_idx = int(m.group(1))
|
||||
assert bid is None or bid == layer_idx
|
||||
|
||||
self._experts[layer_idx][name] = data_torch
|
||||
|
||||
expected = {
|
||||
f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
|
||||
for xid in range(n_experts)
|
||||
for w_name in ("down_proj", "gate_proj", "up_proj")
|
||||
}
|
||||
if expected.issubset(self._experts[layer_idx]):
|
||||
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[layer_idx][ename])
|
||||
del self._experts[layer_idx][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
merged_name = f"model.layers.{layer_idx}.mlp.experts.{w_name}.weight"
|
||||
|
||||
yield from super().modify_tensors(data_torch, merged_name, layer_idx)
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
+25
-4
@@ -785,6 +785,26 @@ class Gemma4UnifiedModel(Gemma4Model):
|
||||
self.gguf_writer.add_suppress_tokens(suppress_tokens)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma4AssistantForCausalLM", "Gemma4UnifiedAssistantForCausalLM")
|
||||
class Gemma4AssistantModel(Gemma4Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
|
||||
if "masked_embedding" in name:
|
||||
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||
return None
|
||||
|
||||
return super().filter_tensors(item)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"])
|
||||
self.gguf_writer.add_nextn_predict_layers(self.block_count)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration")
|
||||
class Gemma4VisionAudioModel(MmprojModel):
|
||||
has_audio_encoder = True
|
||||
@@ -812,10 +832,11 @@ class Gemma4VisionAudioModel(MmprojModel):
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
|
||||
|
||||
# audio params
|
||||
assert self.hparams_audio is not None
|
||||
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
|
||||
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))
|
||||
if self.has_audio_encoder:
|
||||
assert self.hparams_audio is not None
|
||||
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
|
||||
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))
|
||||
|
||||
def is_audio_tensor(self, name: str) -> bool:
|
||||
return "audio_tower" in name or "embed_audio" in name
|
||||
|
||||
+154
-4
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any, Callable, Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
@@ -13,7 +14,7 @@ from .llama import LlamaModel
|
||||
from .mamba import Mamba2Model
|
||||
|
||||
|
||||
@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration")
|
||||
@ModelBase.register("GraniteForCausalLM")
|
||||
class GraniteModel(LlamaModel):
|
||||
"""Conversion for IBM's GraniteForCausalLM"""
|
||||
model_arch = gguf.MODEL_ARCH.GRANITE
|
||||
@@ -46,11 +47,29 @@ class GraniteModel(LlamaModel):
|
||||
self.gguf_writer.add_logit_scale(logits_scale)
|
||||
logger.info("gguf: (granite) logits_scale = %s", logits_scale)
|
||||
|
||||
# If being used as the base for Granite4 Vision, add deepstack_layer_arr
|
||||
if self.hparams.get("spatial_target_layers") or self.hparams.get("deepstack_layer_map"):
|
||||
normalized_projector_map = Granite4VisionMmprojModel.get_normalized_projector_map(self.hparams)
|
||||
deepstack_mapping_arr = [-1 for _ in range(self.block_count)] # Populate with -1 sentinels
|
||||
for proj_idx, (_, llm_layer, _, _) in enumerate(normalized_projector_map):
|
||||
# Skip the first projector which is handled as the base embedding
|
||||
# stream like normal
|
||||
if proj_idx == 0:
|
||||
continue
|
||||
deepstack_mapping_arr[llm_layer] = proj_idx
|
||||
self.gguf_writer.add_deepstack_mapping(deepstack_mapping_arr)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
if name.startswith("encoder."):
|
||||
return None
|
||||
# Skip multimodal tensors
|
||||
if (
|
||||
name.startswith(("encoder."))
|
||||
or "image_" in name
|
||||
or "layerwise_projectors" in name
|
||||
or "spatial_projectors" in name
|
||||
):
|
||||
return
|
||||
return super().filter_tensors(item)
|
||||
|
||||
|
||||
@@ -241,7 +260,8 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
|
||||
assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
|
||||
|
||||
def set_vocab(self):
|
||||
self.hparams["pad_vocab_size_multiple"] = 8
|
||||
# For models with no ssm layers, don't pad for mamba2
|
||||
self.hparams["pad_vocab_size_multiple"] = 8 if self._ssm_layers else 1
|
||||
Mamba2Model.set_vocab(self)
|
||||
|
||||
|
||||
@@ -326,3 +346,133 @@ class GraniteSpeechMmprojModel(MmprojModel):
|
||||
data_torch = data_torch.squeeze(1)
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Granite4VisionForConditionalGeneration")
|
||||
class Granite4VisionMmprojModel(MmprojModel):
|
||||
has_vision_encoder = True
|
||||
has_audio_encoder = False
|
||||
|
||||
@staticmethod
|
||||
def get_normalized_projector_map(global_config: dict) -> list[tuple[int, int, str, int]]:
|
||||
"""Normalize both deepstack and spatial projector maps to the form:
|
||||
(vision_layer, llm_layer, <type>, type_index)
|
||||
|
||||
This is then used to populate the following mappings:
|
||||
- vision_feature_layers (mmproj hparam): ordered list of all
|
||||
vision_layer values where order corresponds with the order of the
|
||||
stacked projector tensors
|
||||
NOTE: Values may appear multiple times for spatial projectors
|
||||
- tensor_prefix_map (mmproj tensors): mapping from tensor prefixes to
|
||||
the index of the corresponding projector in the stacked tensors
|
||||
- deepstack_layer_arr (llm hparam): per-text-layer array indicating
|
||||
which input vision feature should be injected at that layer
|
||||
(-1 if none)
|
||||
|
||||
Output: (vision_layer, llm_layer, <type>, type_index)
|
||||
"""
|
||||
deepstack_map = global_config.get("deepstack_layer_map", []) # [[vis_layer, llm_layer], ...]
|
||||
spatial_layers = global_config.get("spatial_target_layers", []) # [llm_layer, ...]
|
||||
n_text_layers = global_config["text_config"]["num_hidden_layers"]
|
||||
n_vision_layers = global_config["vision_config"]["num_hidden_layers"]
|
||||
normalized_projector_map = []
|
||||
if deepstack_map:
|
||||
for deepstack_idx, (vision_layer, llm_layer) in enumerate(sorted(deepstack_map)):
|
||||
if vision_layer < 0:
|
||||
vision_layer = n_vision_layers + vision_layer
|
||||
if llm_layer < 0:
|
||||
llm_layer = n_text_layers + llm_layer
|
||||
normalized_projector_map.append((vision_layer, llm_layer, "layerwise", deepstack_idx))
|
||||
if spatial_layers:
|
||||
spatial_vision_layer = global_config.get("spatial_vision_layer", -1)
|
||||
if spatial_vision_layer < 0:
|
||||
spatial_vision_layer = n_vision_layers + spatial_vision_layer
|
||||
for spatial_idx, llm_layer in enumerate(spatial_layers):
|
||||
normalized_projector_map.append((spatial_vision_layer, llm_layer, "spatial", spatial_idx))
|
||||
return list(sorted(normalized_projector_map, key=(lambda entry: entry[1])))
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
normalized_projector_map = self.get_normalized_projector_map(self.global_config)
|
||||
self._n_proj = len(normalized_projector_map)
|
||||
|
||||
self._tensor_prefix_map = {
|
||||
f"model.{proj_type}_projectors.{type_idx}": proj_idx
|
||||
for proj_idx, (_, _, proj_type, type_idx) in enumerate(normalized_projector_map)
|
||||
}
|
||||
self._vision_feature_layers = [vision_layer for vision_layer, _, _, _ in normalized_projector_map]
|
||||
self._spatial_offsets = [
|
||||
type_idx if proj_type == "spatial" else -1
|
||||
for _, _, proj_type, type_idx in normalized_projector_map
|
||||
]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
assert self.hparams_vision is not None
|
||||
super().set_gguf_parameters()
|
||||
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE4_VISION)
|
||||
|
||||
# SigLIP encoder hparams
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
|
||||
# Preprocessor
|
||||
self.gguf_writer.add_vision_preproc_image_size(self.hparams.get("image_size", 384))
|
||||
|
||||
# QFormer projector config
|
||||
ds_rate = self.global_config["downsample_rate"]
|
||||
ds_parts = ds_rate.split("/")
|
||||
assert len(ds_parts) == 2, f"Invalid 'downsample_rate' value: {ds_rate}"
|
||||
query_side, window_side = [int(p) for p in ds_parts]
|
||||
self.gguf_writer.add_vision_projector_query_side(query_side)
|
||||
self.gguf_writer.add_vision_projector_window_side(window_side)
|
||||
|
||||
# Set vision feature layers
|
||||
self.gguf_writer.add_vision_feature_layers(self._vision_feature_layers)
|
||||
|
||||
# Set the spatial offests per projector
|
||||
self.gguf_writer.add_vision_spatial_offsets(self._spatial_offsets)
|
||||
|
||||
# Add flattened image grind pinpoints (resolution candidates internally)
|
||||
if pinpoints := self.global_config.get("image_grid_pinpoints"):
|
||||
# Flatten with h, w -> w, h inversion
|
||||
pinpoints = [val for h, w in pinpoints for val in (w, h)]
|
||||
self.gguf_writer.add_vision_image_grid_pinpoints(pinpoints)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, _ = item
|
||||
if ("vision_model.head" in name or name.startswith("lm_head")):
|
||||
return None
|
||||
return super().filter_tensors(item)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
|
||||
# Detect projector tensors and bin them
|
||||
projector_idx = None
|
||||
for prefix, proj_idx in self._tensor_prefix_map.items():
|
||||
if name.startswith(prefix):
|
||||
projector_idx = proj_idx
|
||||
break
|
||||
if projector_idx is not None:
|
||||
# If this projector tensor has a block id within the projector,
|
||||
# alias the bid to projector_idx
|
||||
#
|
||||
# TODO: currently, none of the Granite 4 Vision models have
|
||||
# projectors with multiple QFormer layers, so the `layer.{}` index
|
||||
# is always 0. This allows us to simply map to a single `bid` that
|
||||
# matches the projector index. If this changes, we'll need a
|
||||
# convention that merges the two IDs.
|
||||
id_matches = list(re.finditer(r"\.([0-9]+)\.", name))
|
||||
all_ids = [int(m.group(1)) for m in id_matches]
|
||||
assert len(all_ids) >= 1 and len(all_ids) <= 2, "Must have at least 1 and at most 2 ids in tensor names"
|
||||
# If not layer id, just use the projector index
|
||||
new_bid = projector_idx
|
||||
if len(all_ids) == 1:
|
||||
new_name = name[:id_matches[0].span(1)[0]] + str(new_bid) + name[id_matches[0].span(1)[1]:]
|
||||
else: # len(all_ids) == 2
|
||||
new_bid = projector_idx # + all_ids[1]
|
||||
new_name = name[:id_matches[0].span(0)[0]] + name[id_matches[0].span(1)[1]:id_matches[1].span(1)[0]] + str(new_bid) + name[id_matches[1].span(1)[1]:]
|
||||
yield from super().modify_tensors(data_torch, new_name, new_bid)
|
||||
return
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
+130
-1
@@ -5,12 +5,13 @@ import math
|
||||
|
||||
from typing import Callable, Iterable, TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
from .base import ModelBase, TextModel, gguf
|
||||
from .base import ModelBase, TextModel, gguf, logger
|
||||
|
||||
|
||||
@ModelBase.register(
|
||||
@@ -21,6 +22,9 @@ from .base import ModelBase, TextModel, gguf
|
||||
"VLlama3ForCausalLM",
|
||||
"LlavaForConditionalGeneration",
|
||||
"VoxtralForConditionalGeneration",
|
||||
"LlamaForCausalLMEagle3",
|
||||
"Eagle3Speculator",
|
||||
"Eagle3DraftModel",
|
||||
"IQuestCoderForCausalLM",
|
||||
"LlamaModel")
|
||||
class LlamaModel(TextModel):
|
||||
@@ -39,7 +43,61 @@ class LlamaModel(TextModel):
|
||||
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
# Detect eagle3 draft checkpoint by hparams (some models don't use a distinct HF arch name)
|
||||
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
|
||||
self.is_eagle3 = True
|
||||
self.model_arch = gguf.MODEL_ARCH.EAGLE3
|
||||
logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture")
|
||||
# Re-initialize tensor_map with eagle3 architecture
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
# Update gguf_writer architecture
|
||||
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
|
||||
self.gguf_writer.add_architecture()
|
||||
if self.target_model_dir is None:
|
||||
raise ValueError(
|
||||
"EAGLE-3 model requires --target-model-dir to be specified. "
|
||||
"Please provide the path to the target model directory to read config.json"
|
||||
)
|
||||
# Read both eagle3 raw config and target model config
|
||||
with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f:
|
||||
eagle3_raw_config = json.load(f)
|
||||
with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f:
|
||||
target_config = json.load(f)
|
||||
|
||||
if "text_config" in target_config:
|
||||
target_config = {**target_config, **target_config["text_config"]}
|
||||
self.target_vocab_size = target_config["vocab_size"]
|
||||
|
||||
# target_layers: derived from target model layer count (low/mid/high)
|
||||
target_num_layers = target_config["num_hidden_layers"]
|
||||
target_layers = [2, target_num_layers // 2, target_num_layers - 3]
|
||||
logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)")
|
||||
self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", target_layers)
|
||||
|
||||
# target_hidden_size: prefer eagle3 config, fallback to target config
|
||||
if eagle3_raw_config.get("target_hidden_size") is not None:
|
||||
target_hidden_size = eagle3_raw_config["target_hidden_size"]
|
||||
src = "EAGLE-3 config"
|
||||
else:
|
||||
target_hidden_size = target_config["hidden_size"]
|
||||
src = "target model config"
|
||||
logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})")
|
||||
self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)
|
||||
|
||||
# norm_before_residual (RedHat-style eagle3 specific)
|
||||
norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
|
||||
logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}")
|
||||
self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)
|
||||
|
||||
def set_vocab(self):
|
||||
# eagle3: use tokenizer from target model if provided
|
||||
original_dir_model = None
|
||||
if getattr(self, 'is_eagle3', False):
|
||||
assert self.target_model_dir is not None
|
||||
logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}")
|
||||
original_dir_model = self.dir_model
|
||||
self.dir_model = self.target_model_dir
|
||||
|
||||
if self.origin_hf_arch == "GlmasrModel":
|
||||
return self._set_vocab_glmedge()
|
||||
|
||||
@@ -85,6 +143,10 @@ class LlamaModel(TextModel):
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
# eagle3: Restore original dir_model
|
||||
if original_dir_model is not None:
|
||||
self.dir_model = original_dir_model
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
@@ -129,7 +191,49 @@ class LlamaModel(TextModel):
|
||||
|
||||
return super().filter_tensors((name, gen))
|
||||
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
|
||||
tensors = super().index_tensors(remote_hf_model_id)
|
||||
|
||||
# Handle Eagle3Speculator nested config
|
||||
if "transformer_layer_config" in self.hparams:
|
||||
self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]}
|
||||
|
||||
# eagle3 detection
|
||||
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
|
||||
logger.info("EAGLE-3: renaming midlayer.* / layers.0.* to model.layers.0.*")
|
||||
new_tensors = {}
|
||||
for name, gen in tensors.items():
|
||||
if name.startswith("midlayer."):
|
||||
new_name = "model.layers.0." + name[len("midlayer."):]
|
||||
new_tensors[new_name] = gen
|
||||
elif name.startswith("layers.0."): # Eagle3Speculator format
|
||||
new_name = "model." + name
|
||||
new_tensors[new_name] = gen
|
||||
else:
|
||||
new_tensors[name] = gen
|
||||
return new_tensors
|
||||
|
||||
return tensors
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# eagle3: special tensors that bypass standard llama mapping
|
||||
if getattr(self, 'is_eagle3', False):
|
||||
if name == "fc.weight":
|
||||
yield (name, data_torch)
|
||||
return
|
||||
if name == "d2t":
|
||||
# store for manual int64 handling in prepare_tensors (avoid F32 conversion)
|
||||
if not hasattr(self, '_eagle3_int_tensors'):
|
||||
self._eagle3_int_tensors = {}
|
||||
self._eagle3_int_tensors[name] = data_torch
|
||||
return
|
||||
if name == "t2d":
|
||||
# not used at runtime, skip
|
||||
return
|
||||
if name.endswith(".hidden_norm.weight"):
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM_2, bid), data_torch)
|
||||
return
|
||||
|
||||
n_head = self.find_hparam(["n_heads", "num_attention_heads"])
|
||||
n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])
|
||||
|
||||
@@ -205,8 +309,33 @@ class LlamaModel(TextModel):
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
|
||||
|
||||
def prepare_tensors(self):
|
||||
# eagle3: collect d2t original dtype before parent converts tensors to F32
|
||||
eagle3_original_dtypes = {}
|
||||
if getattr(self, 'is_eagle3', False):
|
||||
for name, data_torch in self.get_tensors():
|
||||
if name == "d2t":
|
||||
eagle3_original_dtypes[name] = data_torch.dtype
|
||||
|
||||
super().prepare_tensors()
|
||||
|
||||
# eagle3: write d2t as absolute target token ids
|
||||
if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'):
|
||||
for name, data_torch in self._eagle3_int_tensors.items():
|
||||
old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype)
|
||||
data = data_torch.to(torch.int64).cpu().numpy()
|
||||
if name == "d2t":
|
||||
data = data.reshape(-1)
|
||||
data = data + np.arange(data.size, dtype=np.int64)
|
||||
if np.any((data < 0) | (data >= self.target_vocab_size)):
|
||||
raise ValueError(f"EAGLE-3 d2t target ids out of range for target vocab size {self.target_vocab_size}")
|
||||
if np.unique(data).size != data.size:
|
||||
raise ValueError("EAGLE-3 d2t contains duplicate target ids")
|
||||
data_qtype = gguf.GGMLQuantizationType.I64
|
||||
|
||||
shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
|
||||
logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
|
||||
@@ -105,8 +105,9 @@ class MistralModel(LlamaModel):
|
||||
gguf_writer.add_rope_scaling_yarn_log_mul(mscale_all_dim)
|
||||
gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
|
||||
|
||||
if "llama_4_scaling" in hparams:
|
||||
gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
|
||||
llama_4_scaling = hparams.get("llama_4_scaling")
|
||||
if llama_4_scaling is not None:
|
||||
gguf_writer.add_attn_temperature_scale(llama_4_scaling["beta"])
|
||||
|
||||
|
||||
class MistralMoeModel(DeepseekV2Model):
|
||||
|
||||
+11
-1
@@ -153,6 +153,15 @@ def parse_args() -> argparse.Namespace:
|
||||
help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--target-model-dir", type=str, default=None,
|
||||
help=(
|
||||
"path to the target model directory; required when converting a standalone draft model "
|
||||
"(e.g. EAGLE3 / DFlash) that needs target-model metadata such as tokenizer, hidden size, and "
|
||||
"layer count to populate its GGUF."
|
||||
),
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
if not args.print_supported_models and args.model is None:
|
||||
parser.error("the following arguments are required: model")
|
||||
@@ -238,7 +247,7 @@ def main() -> None:
|
||||
assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
|
||||
from conversion.pixtral import PixtralModel
|
||||
model_class = PixtralModel
|
||||
elif "moe" in hparams:
|
||||
elif hparams.get("moe") is not None:
|
||||
from conversion.mistral import MistralMoeModel
|
||||
model_class = MistralMoeModel
|
||||
else:
|
||||
@@ -269,6 +278,7 @@ def main() -> None:
|
||||
small_first_shard=args.no_tensor_first_split,
|
||||
remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
|
||||
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
|
||||
target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None,
|
||||
fuse_gate_up_exps=args.fuse_gate_up_exps,
|
||||
fp8_as_q8=args.fp8_as_q8,
|
||||
)
|
||||
|
||||
@@ -100,6 +100,7 @@ models = [
|
||||
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
||||
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
|
||||
{"name": "tiny_aya", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/tiny-aya-base", },
|
||||
{"name": "cohere2moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/North-Mini-Code-1.0", },
|
||||
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
|
||||
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
|
||||
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
|
||||
|
||||
+11
-5
@@ -311,6 +311,10 @@ def parse_args() -> argparse.Namespace:
|
||||
"--base-model-id", type=str,
|
||||
help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--trust-remote-code", default=False, action="store_true",
|
||||
help="trust remote code in the model",
|
||||
)
|
||||
parser.add_argument(
|
||||
"lora_path", type=Path,
|
||||
help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
|
||||
@@ -319,11 +323,11 @@ def parse_args() -> argparse.Namespace:
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]:
|
||||
def load_hparams_from_hf(hf_model_id: str, trust_remote_code: bool) -> tuple[dict[str, Any], Path | None]:
|
||||
from huggingface_hub import try_to_load_from_cache
|
||||
|
||||
# normally, adapter does not come with base model config, we need to load it from AutoConfig
|
||||
config = AutoConfig.from_pretrained(hf_model_id)
|
||||
config = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=trust_remote_code)
|
||||
cache_dir = try_to_load_from_cache(hf_model_id, "config.json")
|
||||
cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None
|
||||
|
||||
@@ -372,13 +376,13 @@ if __name__ == '__main__':
|
||||
# load base model
|
||||
if base_model_id is not None:
|
||||
logger.info(f"Loading base model from Hugging Face: {base_model_id}")
|
||||
hparams, dir_base_model = load_hparams_from_hf(base_model_id)
|
||||
hparams, dir_base_model = load_hparams_from_hf(base_model_id, args.trust_remote_code)
|
||||
elif dir_base_model is None:
|
||||
if "base_model_name_or_path" in lparams:
|
||||
model_id = lparams["base_model_name_or_path"]
|
||||
logger.info(f"Loading base model from Hugging Face: {model_id}")
|
||||
try:
|
||||
hparams, dir_base_model = load_hparams_from_hf(model_id)
|
||||
hparams, dir_base_model = load_hparams_from_hf(model_id, args.trust_remote_code)
|
||||
except OSError as e:
|
||||
logger.error(f"Failed to load base model config: {e}")
|
||||
logger.error("Please try downloading the base model and add its path to --base")
|
||||
@@ -393,7 +397,9 @@ if __name__ == '__main__':
|
||||
|
||||
with torch.inference_mode():
|
||||
try:
|
||||
model_class = get_model_class(hparams["architectures"][0])
|
||||
model_arch = hparams.get("text_config", {}).get("architectures", hparams["architectures"])[0]
|
||||
logger.info("Using model architecture: %s", model_arch)
|
||||
model_class = get_model_class(model_arch)
|
||||
except NotImplementedError:
|
||||
logger.error(f"Model {hparams['architectures'][0]} is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
+31
-31
@@ -14,15 +14,15 @@ Legend:
|
||||
|
||||
| Operation | BLAS | CANN | CPU | CUDA | MTL | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN |
|
||||
|-----------|------|------|------|------|------|------|------|------|------|------|------|
|
||||
| ABS | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| ABS | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||
@@ -41,25 +41,25 @@ Legend:
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| GELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| GELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ |
|
||||
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
@@ -68,9 +68,9 @@ Legend:
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
|
||||
| NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
@@ -79,27 +79,27 @@ Legend:
|
||||
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| POOL_1D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| RELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| RELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROLL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SET | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||
| SGN | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SILU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SGN | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SILU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SIN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
@@ -107,16 +107,16 @@ Legend:
|
||||
| SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||
| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TANH | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| TANH | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
|
||||
+7157
-4196
File diff suppressed because it is too large
Load Diff
+1
-1
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
|
||||
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 13)
|
||||
set(GGML_VERSION_MINOR 15)
|
||||
set(GGML_VERSION_PATCH 1)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
|
||||
@@ -8,10 +8,10 @@ extern "C" {
|
||||
|
||||
#define RPC_PROTO_MAJOR_VERSION 4
|
||||
#define RPC_PROTO_MINOR_VERSION 0
|
||||
#define RPC_PROTO_PATCH_VERSION 0
|
||||
#define RPC_PROTO_PATCH_VERSION 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
|
||||
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
|
||||
#endif
|
||||
|
||||
#define GGML_RPC_MAX_SERVERS 16
|
||||
|
||||
+23
-5
@@ -535,6 +535,7 @@ extern "C" {
|
||||
GGML_OP_IM2COL,
|
||||
GGML_OP_IM2COL_BACK,
|
||||
GGML_OP_IM2COL_3D,
|
||||
GGML_OP_COL2IM_1D,
|
||||
GGML_OP_CONV_2D,
|
||||
GGML_OP_CONV_3D,
|
||||
GGML_OP_CONV_2D_DW,
|
||||
@@ -2007,6 +2008,16 @@ extern "C" {
|
||||
int d1, // dilation dimension 1
|
||||
bool is_2D);
|
||||
|
||||
// col2im_1d: scatter-add GEMM columns back to 1D signal
|
||||
// a: [K*OC, T_in] (columns from matmul, K = a->ne[0]/OC)
|
||||
// result: [T_out, OC] where T_out = (T_in - 1)*s0 + K - 2*p0
|
||||
GGML_API struct ggml_tensor * ggml_col2im_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // columns [K*OC, T_in]
|
||||
int s0, // stride
|
||||
int oc, // output channels
|
||||
int p0); // padding to crop from both sides
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
@@ -2542,10 +2553,16 @@ extern "C" {
|
||||
// TODO: add ggml_gated_delta_net_set_bcast() to be able to configure Q, K broadcast type: tiled vs interleaved [TAG_GGML_GDN_BCAST]
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306
|
||||
//
|
||||
// state is a 3D tensor of shape (S_v*S_v*H, K, n_seqs):
|
||||
// K == 1: output carries the final state only.
|
||||
// K > 1: output carries K snapshot slots; the kernel writes the last min(n_tokens, K)
|
||||
// per-token snapshots into the trailing slots
|
||||
// tensor shapes (S_k == S_v, H_v % H_k == 0):
|
||||
// q, k : [S_k, H_k, n_tokens, n_seqs]
|
||||
// v : [S_v, H_v, n_tokens, n_seqs]
|
||||
// g : [1, H_v, n_tokens, n_seqs] (scalar gate) or [S_v, H_v, n_tokens, n_seqs] (KDA)
|
||||
// beta : [1, H_v, n_tokens, n_seqs]
|
||||
// state : [S_v, S_v, H_v, n_seqs] -- initial recurrent state s0
|
||||
//
|
||||
// the output packs the attention scores [S_v, H_v, n_tokens, n_seqs] followed by K state
|
||||
// snapshots, most-recent first (slot 0 = final state, slot s = state s tokens back). K == 1
|
||||
// keeps only the final state; when n_tokens < K only slots 0..n_tokens-1 are written.
|
||||
GGML_API struct ggml_tensor * ggml_gated_delta_net(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
@@ -2553,7 +2570,8 @@ extern "C" {
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * g,
|
||||
struct ggml_tensor * beta,
|
||||
struct ggml_tensor * state);
|
||||
struct ggml_tensor * state,
|
||||
int64_t K);
|
||||
|
||||
// custom operators
|
||||
|
||||
|
||||
@@ -776,8 +776,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
GGML_ASSERT(src_ss[2].axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
GGML_ASSERT(src_ss[3].axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
GGML_ASSERT(src_ss[4].axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
// state shape is (S_v*S_v*H, K, n_seqs); the heads dim is nested inside axis 0,
|
||||
// so a head-aligned split on the input cache reshapes to axis 0 here (not axis 2).
|
||||
// state shape is [S_v, S_v, H_v, n_seqs] (s0 only); the heads dim is its own axis 2,
|
||||
// so a head-aligned split on the input cache lands on axis 2 here.
|
||||
GGML_ASSERT(src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_2 || src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_1 || src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_0);
|
||||
return {GGML_BACKEND_SPLIT_AXIS_0, {0}, {1}, 1};
|
||||
};
|
||||
|
||||
@@ -1912,6 +1912,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
ggml_compute_forward_im2col_3d(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_COL2IM_1D:
|
||||
{
|
||||
ggml_compute_forward_col2im_1d(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_CONV_2D:
|
||||
{
|
||||
ggml_compute_forward_conv_2d(params, tensor);
|
||||
@@ -2343,6 +2347,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_3D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_COL2IM_1D:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
{
|
||||
@@ -2943,7 +2948,7 @@ struct ggml_cplan ggml_graph_plan(
|
||||
case GGML_OP_GATED_DELTA_NET:
|
||||
{
|
||||
const int64_t S_v = node->src[2]->ne[0];
|
||||
const int64_t K = node->src[5]->ne[1]; // state is (D, K, n_seqs)
|
||||
const int64_t K = ggml_get_op_params_i32(node, 0);
|
||||
const int64_t per_thread = S_v + (K > 1 ? S_v * S_v : 0);
|
||||
cur = per_thread * sizeof(float) * n_tasks;
|
||||
} break;
|
||||
|
||||
+86
-15
@@ -4008,12 +4008,12 @@ static void ggml_compute_forward_rms_norm_back_f32(
|
||||
// dx := scale(dx, rrms)
|
||||
float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
// dx[i00] = (x*(-sum_xdz/sum_eps) + dz) / sqrtf(mean_eps)
|
||||
ggml_vec_cpy_f32 (ne00, dx, x);
|
||||
// ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
|
||||
ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
|
||||
ggml_vec_acc_f32 (ne00, dx, dz);
|
||||
ggml_vec_scale_f32(ne00, dx, rrms);
|
||||
// dx[i00] = (dz + x*(-sum_xdz/sum_eps)) * rrms
|
||||
// note: https://github.com/ggml-org/ggml/issues/1491
|
||||
const float scale_x = (float) (-sum_xdz) / sum_eps;
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
dx[i00] = (dz[i00] + x[i00] * scale_x) * rrms;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6730,6 +6730,78 @@ static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) {
|
||||
return (coord + size) % size; // adding size avoids negative number weirdness
|
||||
}
|
||||
|
||||
// ggml_compute_forward_col2im_1d
|
||||
//
|
||||
// Scatter-add columns [K*OC, T_in] -> signal [T_out, OC]
|
||||
// where T_out = (T_in - 1)*s + K - 2*p. Gather approach: each output reads ceil(K/s) inputs.
|
||||
// Parallelized over the time axis so the split stays balanced whatever OC is.
|
||||
// Supports F32, F16, BF16 input/output (same type), F32 accumulator.
|
||||
|
||||
template <typename elem_t>
|
||||
static void ggml_compute_forward_col2im_1d_impl(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src = dst->src[0]; // [K*OC, T_in]
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
|
||||
const int32_t OC = ((const int32_t *)(dst->op_params))[1];
|
||||
const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
|
||||
|
||||
const int64_t K_OC = src->ne[0];
|
||||
const int64_t T_in = src->ne[1];
|
||||
const int64_t K = K_OC / OC;
|
||||
const int64_t T_out = dst->ne[0];
|
||||
|
||||
const elem_t * col_data = (const elem_t *) src->data;
|
||||
elem_t * dst_data = (elem_t *) dst->data;
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
// Parallelize over the time axis: the split stays balanced whatever OC is,
|
||||
// down to OC = 1 for mono audio, and threads read disjoint column bands
|
||||
const int64_t dr = (T_out + nth - 1) / nth;
|
||||
const int64_t it0 = dr * ith;
|
||||
const int64_t it1 = it0 + dr < T_out ? it0 + dr : T_out;
|
||||
|
||||
for (int64_t oc = 0; oc < OC; oc++) {
|
||||
for (int64_t t_out = it0; t_out < it1; t_out++) {
|
||||
const int64_t t_abs = t_out + p0; // absolute position in uncropped signal
|
||||
// Gather: find all (t_in, k) where t_in * s + k == t_abs, 0 <= k < K
|
||||
int64_t t_in_min = (t_abs - K + 1 + s0 - 1) / s0; // ceil((t_abs-K+1)/s)
|
||||
if (t_in_min < 0) t_in_min = 0;
|
||||
int64_t t_in_max = t_abs / s0;
|
||||
if (t_in_max >= T_in) t_in_max = T_in - 1;
|
||||
|
||||
float sum = 0.0f;
|
||||
for (int64_t t_in = t_in_min; t_in <= t_in_max; t_in++) {
|
||||
int64_t k = t_abs - t_in * s0;
|
||||
if (k >= 0 && k < K) {
|
||||
// col layout: [K*OC, T_in], element (oc*K+k, t_in)
|
||||
sum += type_conversion_table<elem_t>::to_f32(col_data[(oc * K + k) + t_in * K_OC]);
|
||||
}
|
||||
}
|
||||
// dst layout: [T_out, OC], element (t_out, oc)
|
||||
dst_data[t_out + oc * T_out] = type_conversion_table<elem_t>::from_f32(sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_col2im_1d(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
switch (dst->src[0]->type) {
|
||||
case GGML_TYPE_F32: ggml_compute_forward_col2im_1d_impl<float> (params, dst); break;
|
||||
case GGML_TYPE_F16: ggml_compute_forward_col2im_1d_impl<ggml_fp16_t>(params, dst); break;
|
||||
case GGML_TYPE_BF16: ggml_compute_forward_col2im_1d_impl<ggml_bf16_t>(params, dst); break;
|
||||
default: GGML_ABORT("col2im_1d: unsupported type %d", dst->src[0]->type);
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_2d
|
||||
|
||||
|
||||
@@ -10552,11 +10624,11 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
||||
|
||||
const bool kda = (neg0 == S_v);
|
||||
|
||||
// state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
|
||||
const int64_t K = src_state->ne[1];
|
||||
// K (snapshot slot count) is an op param; state holds s0 only [S_v, S_v, H, n_seqs].
|
||||
const int64_t K = ggml_get_op_params_i32(dst, 0);
|
||||
GGML_ASSERT(K >= 1);
|
||||
// per-seq stride in floats (slot 0 of seq s lives at state + s * seq_stride)
|
||||
const int64_t state_seq_stride = src_state->nb[2] / sizeof(float);
|
||||
// per-seq stride in floats (seq s starts at state + s * seq_stride)
|
||||
const int64_t state_seq_stride = src_state->nb[3] / sizeof(float);
|
||||
|
||||
const int64_t per_thread = S_v + (K > 1 ? S_v * S_v : 0);
|
||||
const int ith = params->ith;
|
||||
@@ -10572,9 +10644,8 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
||||
float * attn_out_base = (float *)dst->data;
|
||||
float * state_out_base = (float *)dst->data + attn_score_elems;
|
||||
|
||||
// snapshot slot mapping: target_slot = t - shift. When n_tokens < K only the last
|
||||
// n_tokens slots are written; earlier slots are left untouched (caller-owned).
|
||||
const int64_t shift = n_tokens - K;
|
||||
// snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
|
||||
// When n_tokens < K only slots 0..n_tokens-1 are written; older slots are caller-owned.
|
||||
|
||||
const float * state_in_base = (const float *)src_state->data;
|
||||
|
||||
@@ -10602,7 +10673,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
||||
: state_out_base + (iv3 * H + iv1) * S_v * S_v;
|
||||
|
||||
// copy input state into the working buffer and operate in-place
|
||||
// state layout (D, K, n_seqs): slot 0 of seq iv3 starts at iv3 * state_seq_stride.
|
||||
// state layout [S_v, S_v, H, n_seqs]: seq iv3 starts at iv3 * state_seq_stride.
|
||||
const float * s_in = state_in_base + iv3 * state_seq_stride + iv1 * S_v * S_v;
|
||||
memcpy(s_out, s_in, S_v * S_v * sizeof(float));
|
||||
|
||||
@@ -10655,7 +10726,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
||||
attn_data += S_v * H; // advance to next token
|
||||
|
||||
if (K > 1) {
|
||||
const int64_t target_slot = t - shift;
|
||||
const int64_t target_slot = n_tokens - 1 - t;
|
||||
if (target_slot >= 0 && target_slot < K) {
|
||||
float * curr_state_o = state_out_base + target_slot * state_size_per_snap +
|
||||
(iv3 * H + iv1) * S_v * S_v;
|
||||
|
||||
@@ -68,6 +68,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
|
||||
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_col2im_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
@@ -1,16 +1,18 @@
|
||||
#include "concat.cuh"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// contiguous kernels
|
||||
template <int dim>
|
||||
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont(const float * x,
|
||||
const float * y,
|
||||
float * dst,
|
||||
int64_t ne00,
|
||||
int64_t ne01,
|
||||
int64_t ne02,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2) {
|
||||
template <typename T, int dim>
|
||||
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_cont(const T * x,
|
||||
const T * y,
|
||||
T * dst,
|
||||
int64_t ne00,
|
||||
int64_t ne01,
|
||||
int64_t ne02,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2) {
|
||||
static_assert(dim >= 0 && dim <= 2, "dim must be in [0, 2]");
|
||||
|
||||
const int64_t n = ne0 * ne1 * ne2;
|
||||
@@ -50,37 +52,37 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32_cuda(const float * x,
|
||||
const float * y,
|
||||
float * dst,
|
||||
int64_t ne00,
|
||||
int64_t ne01,
|
||||
int64_t ne02,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int dim,
|
||||
cudaStream_t stream) {
|
||||
template <typename T>
|
||||
static void concat_cont_cuda(const T * x,
|
||||
const T * y,
|
||||
T * dst,
|
||||
int64_t ne00,
|
||||
int64_t ne01,
|
||||
int64_t ne02,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int dim,
|
||||
cudaStream_t stream) {
|
||||
const int64_t n = ne0 * ne1 * ne2;
|
||||
const int num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
|
||||
|
||||
if (dim == 0) {
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream);
|
||||
ggml_cuda_kernel_launch(concat_f32_cont<0>, launch_params,x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
ggml_cuda_kernel_launch(concat_cont<T, 0>, launch_params, x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
return;
|
||||
}
|
||||
if (dim == 1) {
|
||||
concat_f32_cont<1>
|
||||
<<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
concat_cont<T, 1><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
return;
|
||||
}
|
||||
concat_f32_cont<2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
concat_cont<T, 2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
}
|
||||
|
||||
// non-contiguous kernel (slow)
|
||||
template <int dim>
|
||||
template <typename T, int dim>
|
||||
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
|
||||
concat_f32_non_cont(
|
||||
concat_non_cont(
|
||||
const char * src0,
|
||||
const char * src1,
|
||||
char * dst,
|
||||
@@ -107,61 +109,49 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
|
||||
uint64_t nb0,
|
||||
uint64_t nb1,
|
||||
uint64_t nb2,
|
||||
uint64_t nb3){
|
||||
uint64_t nb3) {
|
||||
static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");
|
||||
|
||||
const int64_t i3 = blockIdx.z;
|
||||
const int64_t i2 = blockIdx.y;
|
||||
const int64_t i1 = blockIdx.x;
|
||||
|
||||
const float * x;
|
||||
const T * x;
|
||||
|
||||
for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
|
||||
x = (const T *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
} else {
|
||||
if constexpr (dim == 0) {
|
||||
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
|
||||
x = (const T *)(src1 + i3*nb13 + i2*nb12 + i1*nb11 + (i0 - ne00)*nb10);
|
||||
} else if constexpr (dim == 1) {
|
||||
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
|
||||
x = (const T *)(src1 + i3*nb13 + i2*nb12 + (i1 - ne01)*nb11 + i0*nb10);
|
||||
} else if constexpr (dim == 2) {
|
||||
x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
|
||||
x = (const T *)(src1 + i3*nb13 + (i2 - ne02)*nb12 + i1*nb11 + i0*nb10);
|
||||
} else if constexpr (dim == 3) {
|
||||
x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
|
||||
x = (const T *)(src1 + (i3 - ne03)*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
|
||||
}
|
||||
}
|
||||
|
||||
float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
T * y = (T *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
template <typename T>
|
||||
static void concat_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, int dim, cudaStream_t stream) {
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
|
||||
float * dst_d = (float *)dst->data;
|
||||
const T * src0_d = (const T *) src0->data;
|
||||
const T * src1_d = (const T *) src1->data;
|
||||
T * dst_d = (T *) dst->data;
|
||||
|
||||
if (dim != 3) {
|
||||
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_f32_cuda(
|
||||
src0_d + i3 * (src0->nb[3] / 4),
|
||||
src1_d + i3 * (src1->nb[3] / 4),
|
||||
dst_d + i3 * ( dst->nb[3] / 4),
|
||||
for (int64_t i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_cont_cuda(
|
||||
src0_d + i3*(src0->nb[3] / sizeof(T)),
|
||||
src1_d + i3*(src1->nb[3] / sizeof(T)),
|
||||
dst_d + i3*( dst->nb[3] / sizeof(T)),
|
||||
src0->ne[0], src0->ne[1], src0->ne[2],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
|
||||
}
|
||||
@@ -169,13 +159,13 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const size_t size0 = ggml_nbytes(src0);
|
||||
const size_t size1 = ggml_nbytes(src1);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync((char *) dst->data, src0->data, size0, cudaMemcpyDeviceToDevice, stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync((char *) dst->data + size0, src1->data, size1, cudaMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
} else {
|
||||
dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
|
||||
auto launch_kernel = [&](auto dim) {
|
||||
concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
|
||||
concat_non_cont<T, dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
|
||||
(const char *) src0->data, (const char *) src1->data, (char *) dst->data,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
@@ -203,3 +193,35 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
GGML_ASSERT(dst->type == src0->type);
|
||||
GGML_ASSERT(!ggml_is_quantized(src0->type));
|
||||
GGML_ASSERT(ggml_blck_size(src0->type) == 1);
|
||||
|
||||
switch (ggml_type_size(src0->type)) {
|
||||
case 1:
|
||||
concat_cuda<uint8_t>(src0, src1, dst, dim, stream);
|
||||
break;
|
||||
case 2:
|
||||
concat_cuda<uint16_t>(src0, src1, dst, dim, stream);
|
||||
break;
|
||||
case 4:
|
||||
concat_cuda<uint32_t>(src0, src1, dst, dim, stream);
|
||||
break;
|
||||
case 8:
|
||||
concat_cuda<uint64_t>(src0, src1, dst, dim, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported type size: %zu", ggml_type_size(src0->type));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,9 +39,9 @@ gated_delta_net_cuda(const float * q,
|
||||
float * attn_data = dst;
|
||||
float * state = dst + attn_score_elems;
|
||||
|
||||
// input state layout (D, K, n_seqs) — seq stride is K * D = K * H * S_v * S_v.
|
||||
// input state holds s0 only: [S_v, S_v, H, n_seqs] — seq stride is D = H * S_v * S_v.
|
||||
// output state layout (per-slot D * n_seqs) — same per-(seq,head) offset as before.
|
||||
const int64_t state_in_offset = sequence * K * H * S_v * S_v + h_idx * S_v * S_v;
|
||||
const int64_t state_in_offset = sequence * H * S_v * S_v + h_idx * S_v * S_v;
|
||||
const int64_t state_out_offset = (sequence * H + h_idx) * S_v * S_v;
|
||||
state += state_out_offset;
|
||||
curr_state += state_in_offset + col * S_v;
|
||||
@@ -143,12 +143,10 @@ gated_delta_net_cuda(const float * q,
|
||||
attn_data += S_v * H;
|
||||
|
||||
if constexpr (keep_rs_t) {
|
||||
// slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots
|
||||
// are written; earlier slots are left untouched (caller-owned).
|
||||
const int shift = (int) n_tokens - K;
|
||||
|
||||
// snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
|
||||
// When n_tokens < K only slots 0..n_tokens-1 are written; older slots are caller-owned.
|
||||
const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output
|
||||
const int target_slot = t - shift;
|
||||
const int target_slot = (int) n_tokens - 1 - t;
|
||||
if (target_slot >= 0 && target_slot < K) {
|
||||
float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset;
|
||||
#pragma unroll
|
||||
@@ -286,8 +284,8 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
// state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
|
||||
const int K = (int) src_state->ne[1];
|
||||
// K (snapshot slot count) is an op param; state holds s0 only [S_v, S_v, H, n_seqs].
|
||||
const int K = ggml_get_op_params_i32(dst, 0);
|
||||
const bool keep_rs = K > 1;
|
||||
|
||||
if (kda) {
|
||||
|
||||
@@ -622,6 +622,18 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
|
||||
|
||||
// cuda buffer
|
||||
|
||||
struct ggml_backend_cuda_device_context {
|
||||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
int op_offload_min_batch_size;
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
std::mutex device_mutex;
|
||||
int active_count = 0;
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
};
|
||||
|
||||
struct ggml_backend_cuda_buffer_context {
|
||||
int device;
|
||||
void * dev_ptr = nullptr;
|
||||
@@ -639,6 +651,13 @@ struct ggml_backend_cuda_buffer_context {
|
||||
|
||||
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context;
|
||||
std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
|
||||
dev_ctx->active_count--;
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -791,6 +810,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
||||
|
||||
ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context;
|
||||
std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
|
||||
dev_ctx->active_count++;
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
||||
}
|
||||
|
||||
@@ -1490,6 +1515,12 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context;
|
||||
std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
|
||||
dev_ctx->active_count--;
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
CUDA_CHECK(cudaFreeHost(buffer->context));
|
||||
}
|
||||
|
||||
@@ -1498,6 +1529,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_cuda_set_device(0); // cudaMallocHost can create the implicit CUDA device context, make sure that this is consistently done on device 0.
|
||||
|
||||
void * ptr = nullptr;
|
||||
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
||||
if (err != cudaSuccess) {
|
||||
@@ -1523,6 +1556,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
||||
buffer->buft = buft;
|
||||
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context;
|
||||
std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
|
||||
dev_ctx->active_count++;
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
@@ -3140,6 +3179,12 @@ static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
|
||||
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) backend->device->context;
|
||||
std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
|
||||
dev_ctx->active_count--;
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
delete cuda_ctx;
|
||||
delete backend;
|
||||
}
|
||||
@@ -4871,14 +4916,6 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
|
||||
|
||||
// backend device
|
||||
|
||||
struct ggml_backend_cuda_device_context {
|
||||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
int op_offload_min_batch_size;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
return ctx->name.c_str();
|
||||
@@ -4967,6 +5004,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
|
||||
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
std::lock_guard<std::mutex> lock(ctx->device_mutex);
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
CUDA_CHECK(cudaMemGetInfo(free, total));
|
||||
|
||||
@@ -4993,6 +5035,13 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
}
|
||||
#endif // defined(__linux__)
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
// If no backends or buffers are active, the cudaMemGetInfo call above lazily created a CUDA
|
||||
// context that permanently consumes VRAM. Reset the device to free it.
|
||||
if (ctx->active_count == 0) {
|
||||
CUDA_CHECK(cudaDeviceReset());
|
||||
}
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
|
||||
@@ -5296,7 +5345,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_CONCAT:
|
||||
{
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
|
||||
ggml_type src1_type = op->src[1]->type;
|
||||
return src0_type == src1_type &&
|
||||
src0_type == op->type &&
|
||||
!ggml_is_quantized(src0_type) &&
|
||||
ggml_blck_size(src0_type) == 1 &&
|
||||
(ggml_type_size(src0_type) == 1 ||
|
||||
ggml_type_size(src0_type) == 2 ||
|
||||
ggml_type_size(src0_type) == 4 ||
|
||||
ggml_type_size(src0_type) == 8);
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
{
|
||||
@@ -5687,13 +5744,21 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device);
|
||||
|
||||
ggml_backend_t cuda_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_cuda_guid(),
|
||||
/* .iface = */ ggml_backend_cuda_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
|
||||
/* .device = */ dev,
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
|
||||
std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
|
||||
dev_ctx->active_count++;
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
return cuda_backend;
|
||||
}
|
||||
|
||||
|
||||
@@ -411,7 +411,6 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_K:
|
||||
return 8;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return 2;
|
||||
|
||||
@@ -67,6 +67,7 @@ __global__ void __launch_bounds__(splitD, 1)
|
||||
__shared__ CubTempStorage cub_temp_storage;
|
||||
|
||||
BlockLoad(cub_temp_storage.load_temp).Load(A_block, regA);
|
||||
__syncthreads();
|
||||
BlockLoad(cub_temp_storage.load_temp).Load(s0_block, regs0);
|
||||
#else
|
||||
const int stride_s0 = src0_nb2 / sizeof(float);
|
||||
@@ -105,6 +106,7 @@ __global__ void __launch_bounds__(splitD, 1)
|
||||
regs0[n] = state;
|
||||
}
|
||||
y_block[i * stride_y + threadIdx.x] = sumf;
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#ifdef USE_CUB
|
||||
@@ -249,9 +251,8 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
|
||||
GGML_ASSERT(head_dim == 1);
|
||||
GGML_ASSERT(n_group == 1);
|
||||
const dim3 blocks(n_seq, (n_head + threads - 1) / threads, 1);
|
||||
const int smem_size = (threads * (d_state + 1) * 2) * sizeof(float);
|
||||
if (d_state == 16) {
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, smem_size, stream);
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream);
|
||||
switch (n_tok)
|
||||
{
|
||||
case 1:
|
||||
|
||||
Vendored
+2
-2
@@ -219,9 +219,9 @@
|
||||
#define RDNA3
|
||||
#endif // defined(__GFX11__)
|
||||
|
||||
#if defined(__gfx1150__) || defined(__gfx1151__)
|
||||
#if defined(__gfx1150__) || defined(__gfx1151__) || defined(__gfx1152__) || defined(__gfx1153__)
|
||||
#define RDNA3_5
|
||||
#endif // defined(__gfx1150__) || defined(__gfx1151__)
|
||||
#endif // defined(__gfx1150__) || defined(__gfx1151__) || defined(__gfx1152__) || defined(__gfx1153__)
|
||||
|
||||
#if defined(RDNA3) && !defined(RDNA3_5)
|
||||
#define RDNA3_0
|
||||
|
||||
@@ -2538,7 +2538,7 @@ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_ses
|
||||
const int64_t H = v->ne[1];
|
||||
const int64_t n_tokens = v->ne[2];
|
||||
const int64_t n_seqs = v->ne[3];
|
||||
const int64_t K = state->ne[1];
|
||||
const int64_t K = ggml_get_op_params_i32(op, 0);
|
||||
|
||||
if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) {
|
||||
return false;
|
||||
@@ -2551,7 +2551,8 @@ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_ses
|
||||
if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
|
||||
return false;
|
||||
}
|
||||
if (ggml_nelements(state) != S_v * S_v * H * n_seqs * K) {
|
||||
// state holds s0 only [S_v, S_v, H, n_seqs]; K is op param 0.
|
||||
if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
|
||||
return false;
|
||||
}
|
||||
if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
|
||||
|
||||
@@ -584,7 +584,7 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
|
||||
const uint32_t H = v->ne[1];
|
||||
const uint32_t n_tokens = v->ne[2];
|
||||
const uint32_t n_seqs = v->ne[3];
|
||||
const uint32_t K = state->ne[1];
|
||||
const uint32_t K = octx->op_params[0];
|
||||
|
||||
const uint32_t total_rows = H * n_seqs;
|
||||
if (ith >= total_rows) {
|
||||
@@ -618,9 +618,8 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
|
||||
struct fastdiv_values fd_rq3 = init_fastdiv_values(rq3);
|
||||
struct fastdiv_values fd_rk3 = init_fastdiv_values(rk3);
|
||||
|
||||
const uint64_t state_seq_stride = state->nb[2] / sizeof(float);
|
||||
const uint64_t state_seq_stride = state->nb[3] / sizeof(float);
|
||||
const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs;
|
||||
const int64_t shift = (int64_t) n_tokens - (int64_t) K;
|
||||
|
||||
uint32_t ir_prefetch = ith;
|
||||
int spad_idx = 0;
|
||||
@@ -630,7 +629,8 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
|
||||
const uint32_t piv1 = fastmodulo(ir_prefetch, H, &fd_H);
|
||||
const uint32_t piv3 = fastdiv(ir_prefetch, &fd_H);
|
||||
const float * ps_in = state_in_base + (uint64_t) piv3 * state_seq_stride + (uint64_t) piv1 * S_v * S_v;
|
||||
float * ps_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) piv3 * H + piv1) * S_v * S_v;
|
||||
// final state lands in snapshot slot 0 (most-recent-first ordering)
|
||||
float * ps_out = state_out_base + ((uint64_t) piv3 * H + piv1) * S_v * S_v;
|
||||
|
||||
// Push dummy write-back
|
||||
dma_queue_push(dma, dma_make_ptr(ps_out, s_work[spad_idx]),
|
||||
@@ -661,7 +661,8 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
|
||||
const uint32_t iq3 = fastdiv(iv3, &fd_rq3);
|
||||
const uint32_t ik3 = fastdiv(iv3, &fd_rk3);
|
||||
|
||||
float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
// final state lands in snapshot slot 0 (most-recent-first ordering)
|
||||
float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
|
||||
float * attn_data = dst_base + ((uint64_t) iv3 * n_tokens * H + iv1) * S_v;
|
||||
|
||||
@@ -792,7 +793,8 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
|
||||
}
|
||||
|
||||
if (K > 1) {
|
||||
const int64_t target_slot = (int64_t) t - shift;
|
||||
// snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
|
||||
const int64_t target_slot = (int64_t) n_tokens - 1 - (int64_t) t;
|
||||
if (target_slot >= 0 && target_slot < (int64_t) K) {
|
||||
float * curr_state_o = state_out_base + (uint64_t) target_slot * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
if (curr_state_o != s_out) {
|
||||
@@ -844,7 +846,6 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
|
||||
const uint32_t S_v = v->ne[0];
|
||||
const uint32_t H = v->ne[1];
|
||||
const uint32_t n_seqs = v->ne[3];
|
||||
const uint32_t K = state->ne[1];
|
||||
|
||||
const uint32_t total_rows = H * n_seqs;
|
||||
if (ith >= total_rows) {
|
||||
@@ -878,8 +879,7 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
|
||||
struct fastdiv_values fd_rq3 = init_fastdiv_values(rq3);
|
||||
struct fastdiv_values fd_rk3 = init_fastdiv_values(rk3);
|
||||
|
||||
const uint64_t state_seq_stride = state->nb[2] / sizeof(float);
|
||||
const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs;
|
||||
const uint64_t state_seq_stride = state->nb[3] / sizeof(float);
|
||||
|
||||
uint32_t ir_prefetch = ith;
|
||||
int spad_idx = 0;
|
||||
@@ -889,7 +889,8 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
|
||||
const uint32_t piv1 = fastmodulo(ir_prefetch, H, &fd_H);
|
||||
const uint32_t piv3 = fastdiv(ir_prefetch, &fd_H);
|
||||
const float * ps_in = state_in_base + (uint64_t) piv3 * state_seq_stride + (uint64_t) piv1 * S_v * S_v;
|
||||
float * ps_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) piv3 * H + piv1) * S_v * S_v;
|
||||
// final state lands in snapshot slot 0 (most-recent-first ordering)
|
||||
float * ps_out = state_out_base + ((uint64_t) piv3 * H + piv1) * S_v * S_v;
|
||||
|
||||
// Push dummy write-back
|
||||
dma_queue_push(dma, dma_make_ptr(ps_out, s_work[spad_idx]),
|
||||
@@ -920,7 +921,8 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
|
||||
const uint32_t iq3 = fastdiv(iv3, &fd_rq3);
|
||||
const uint32_t ik3 = fastdiv(iv3, &fd_rk3);
|
||||
|
||||
float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
// final state lands in snapshot slot 0 (most-recent-first ordering)
|
||||
float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
|
||||
float * attn_data = dst_base + ((uint64_t) iv3 * H + iv1) * S_v;
|
||||
|
||||
@@ -1097,7 +1099,7 @@ int op_gated_delta_net(struct htp_ops_context * octx) {
|
||||
const uint32_t H = v->ne[1];
|
||||
const uint32_t n_tokens = v->ne[2];
|
||||
const uint32_t n_seqs = v->ne[3];
|
||||
const uint32_t K = state->ne[1];
|
||||
const uint32_t K = octx->op_params[0];
|
||||
|
||||
if (S_v == 0 || S_v > HTP_GDN_MAX_SV || H == 0 || n_tokens == 0 || n_seqs == 0) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
@@ -1110,7 +1112,8 @@ int op_gated_delta_net(struct htp_ops_context * octx) {
|
||||
(n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if (state->ne[0] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) {
|
||||
// state holds s0 only: [S_v, S_v, H, n_seqs]
|
||||
if (state->ne[0] != S_v || state->ne[1] != S_v || state->ne[2] != H || state->ne[3] != n_seqs) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
|
||||
|
||||
@@ -590,8 +590,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net(
|
||||
const int ne20 = op->src[2]->ne[0]; // S_v
|
||||
const int ne21 = op->src[2]->ne[1]; // H
|
||||
const int ne30 = op->src[3]->ne[0]; // G
|
||||
// state is src[5], 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
|
||||
const int K = op->src[5]->ne[1];
|
||||
// state is src[5], 4D [S_v, S_v, H_v, n_seqs] (s0 only); K is op param 0.
|
||||
const int K = ggml_get_op_params_i32(op, 0);
|
||||
|
||||
const int nsg = op->src[2]->ne[0]/32;
|
||||
|
||||
@@ -1738,10 +1738,14 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col(ggml_meta
|
||||
GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
|
||||
|
||||
const bool is_2D = ((const int32_t *)(op->op_params))[6] == 1;
|
||||
const int64_t KH = is_2D ? ne01 : 1;
|
||||
const int64_t KW = ne00;
|
||||
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
if (ne00*ne01 <= 1024) {
|
||||
if (KH*KW <= 1024) {
|
||||
snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type));
|
||||
} else {
|
||||
snprintf(base, 256, "kernel_im2col_ext_%s", ggml_type_name(op->type));
|
||||
|
||||
@@ -1120,8 +1120,17 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_CONCAT:
|
||||
return true;
|
||||
case GGML_OP_CONCAT:
|
||||
{
|
||||
// kernel_concat copies one float-sized value per element.
|
||||
// Other scalar types need a type-generic copy kernel first.
|
||||
const enum ggml_type src0_type = op->src[0]->type;
|
||||
const enum ggml_type src1_type = op->src[1]->type;
|
||||
return src0_type == src1_type &&
|
||||
src0_type == op->type &&
|
||||
(src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_I32);
|
||||
}
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
|
||||
@@ -2599,9 +2599,9 @@ kernel void kernel_gated_delta_net_impl(
|
||||
|
||||
const float scale = 1.0f / sqrt((float)S_v);
|
||||
|
||||
// input state layout (D, K, n_seqs): per-seq stride is K*H*D; we read slot 0.
|
||||
// input state layout [S_v, S_v, H, n_seqs] (s0 only): per-seq stride is H*D.
|
||||
// state is stored transposed: M[i20][is] = S[is][i20], so row i20 is contiguous
|
||||
const uint state_in_base = (i23*K*args.ne21 + i21)*S_v*S_v + i20*S_v;
|
||||
const uint state_in_base = (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;
|
||||
device const float * s_ptr = (device const float *) (s) + state_in_base;
|
||||
|
||||
float ls[NSG];
|
||||
@@ -2620,9 +2620,8 @@ kernel void kernel_gated_delta_net_impl(
|
||||
device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21);
|
||||
device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G;
|
||||
|
||||
// snapshot slot mapping: target_slot = t - shift. When n_tokens < K, only the last
|
||||
// n_tokens slots are written; earlier slots are left untouched (caller-owned).
|
||||
const int shift = (int)args.ne22 - (int)K;
|
||||
// snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
|
||||
// When n_tokens < K, only slots 0..n_tokens-1 are written; older slots are caller-owned.
|
||||
|
||||
// output state base offset: after attention scores
|
||||
const uint attn_size = args.ne22 * args.ne21 * S_v * args.ne23;
|
||||
@@ -2680,7 +2679,7 @@ kernel void kernel_gated_delta_net_impl(
|
||||
g_ptr += args.ne21*G;
|
||||
|
||||
if (K > 1) {
|
||||
const int target_slot = (int)t - shift;
|
||||
const int target_slot = (int)args.ne22 - 1 - (int)t;
|
||||
if (target_slot >= 0 && target_slot < (int)K) {
|
||||
device float * dst_state = (device float *) (dst) + attn_size + (uint)target_slot * state_size_per_snap + state_out_base;
|
||||
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||
|
||||
@@ -142,6 +142,10 @@ set(GGML_OPENCL_KERNELS
|
||||
gemm_noshuffle_q4_0_f32
|
||||
gemv_noshuffle_q4_1_f32
|
||||
gemm_noshuffle_q4_1_f32
|
||||
gemv_noshuffle_q5_0_f32
|
||||
gemm_noshuffle_q5_0_f32
|
||||
gemv_noshuffle_q5_1_f32
|
||||
gemm_noshuffle_q5_1_f32
|
||||
gemv_noshuffle_iq4_nl_f32
|
||||
gemm_noshuffle_iq4_nl_f32
|
||||
gemv_noshuffle_q8_0_f32
|
||||
|
||||
@@ -558,7 +558,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
|
||||
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
|
||||
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
|
||||
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32;
|
||||
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_f32_f32_pack, kernel_cpy_i32_i32;
|
||||
cl_kernel kernel_mul_mat_f32_f32;
|
||||
cl_kernel kernel_mul_mat_f16_f16;
|
||||
cl_kernel kernel_mul_mat_f16_f32_1row;
|
||||
@@ -593,6 +593,10 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_restore_block_q4_0_noshuffle;
|
||||
cl_kernel kernel_convert_block_q4_1_noshuffle;
|
||||
cl_kernel kernel_restore_block_q4_1_noshuffle;
|
||||
cl_kernel kernel_convert_block_q5_0_noshuffle;
|
||||
cl_kernel kernel_restore_block_q5_0_noshuffle;
|
||||
cl_kernel kernel_convert_block_q5_1_noshuffle;
|
||||
cl_kernel kernel_restore_block_q5_1_noshuffle;
|
||||
cl_kernel kernel_convert_block_q4_K_noshuffle;
|
||||
cl_kernel kernel_restore_block_q4_K_noshuffle;
|
||||
cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K;
|
||||
@@ -639,7 +643,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
|
||||
cl_kernel kernel_upscale;
|
||||
cl_kernel kernel_upscale_bilinear;
|
||||
cl_kernel kernel_concat_f32;
|
||||
cl_kernel kernel_concat_f32, kernel_concat_f32_pack;
|
||||
cl_kernel kernel_conv_2d_f16;
|
||||
cl_kernel kernel_conv_2d_f32;
|
||||
cl_kernel kernel_conv_2d_f16_f32;
|
||||
@@ -829,6 +833,10 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_gemm_noshuffle_q6_K_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_q5_k_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q5_k_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_q5_0_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q5_0_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_q5_1_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q5_1_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_iq4_nl_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_iq4_nl_f32;
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
@@ -1121,6 +1129,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_cpy_f32_f32_pack = clCreateKernel(prog, "kernel_cpy_f32_f32_pack", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
@@ -1151,6 +1160,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1", &err), err));
|
||||
@@ -2615,6 +2628,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_concat_f32_pack = clCreateKernel(prog, "kernel_concat_f32_pack", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
@@ -3063,6 +3077,80 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_noshuffle_q5_0_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_noshuffle_q5_0_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_noshuffle_q5_0_f32.cl");
|
||||
#endif
|
||||
cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q5_0_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q5_0_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_noshuffle_q5_0_f32
|
||||
{
|
||||
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable ";
|
||||
if (backend_ctx->has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemv_noshuffle_q5_0_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemv_noshuffle_q5_0_f32.cl");
|
||||
#endif
|
||||
cl_program prog = build_program_from_source(
|
||||
backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q5_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q5_0_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_noshuffle_q5_1_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_noshuffle_q5_1_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_noshuffle_q5_1_f32.cl");
|
||||
#endif
|
||||
cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q5_1_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q5_1_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_noshuffle_q5_1_f32
|
||||
{
|
||||
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable ";
|
||||
if (backend_ctx->has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemv_noshuffle_q5_1_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemv_noshuffle_q5_1_f32.cl");
|
||||
#endif
|
||||
cl_program prog = build_program_from_source(
|
||||
backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q5_1_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q5_1_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_noshuffle_iq4_nl_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -6105,15 +6193,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0;
|
||||
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &n_blk));
|
||||
|
||||
size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64) * 64, 1, 1};
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
@@ -6122,7 +6211,39 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
tensor->extra = extra;
|
||||
|
||||
int M = tensor->ne[1];
|
||||
int K = tensor->ne[0];
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
|
||||
// Transpose qs as ushort
|
||||
transpose_2d_as_16b(backend_ctx, extra->qs, extra->qs, size_qs, K/4, M);
|
||||
// Transpose qh as uchar
|
||||
transpose_2d_as_8b(backend_ctx, extra->qh, extra->qh, size_qh, K/8, M);
|
||||
// Transpose d as ushort
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
|
||||
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0;
|
||||
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &n_blk));
|
||||
|
||||
size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64) * 64, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
tensor->extra = extra;
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q5_1) {
|
||||
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
||||
@@ -6223,6 +6344,42 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_1_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->m));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
tensor->extra = extra;
|
||||
|
||||
int M = tensor->ne[1];
|
||||
int K = tensor->ne[0];
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
|
||||
// Transpose qs as ushort
|
||||
transpose_2d_as_16b(backend_ctx, extra->qs, extra->qs, size_qs, K/4, M);
|
||||
// Transpose qh as uchar
|
||||
transpose_2d_as_8b(backend_ctx, extra->qh, extra->qh, size_qh, K/8, M);
|
||||
// Transpose d as ushort
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
|
||||
// Transpose m as ushort
|
||||
transpose_2d_as_16b(backend_ctx, extra->m, extra->m, size_m, K/32, M);
|
||||
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_1;
|
||||
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
@@ -7297,6 +7454,48 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
ggml_cl_buffer buf_trans_qs;
|
||||
ggml_cl_buffer buf_trans_qh;
|
||||
ggml_cl_buffer buf_trans_d;
|
||||
ggml_cl_buffer buf_unpacked;
|
||||
|
||||
cl_int M = tensor->ne[1];
|
||||
cl_int K = tensor->ne[0];
|
||||
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
|
||||
size_t size_qs = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
|
||||
size_t size_qh = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(int32_t);
|
||||
size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
|
||||
|
||||
buf_trans_qs.allocate(backend_ctx->context, size_qs);
|
||||
buf_trans_qh.allocate(backend_ctx->context, size_qh);
|
||||
buf_trans_d.allocate(backend_ctx->context, size_d);
|
||||
buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
|
||||
|
||||
transpose_2d_as_16b(backend_ctx, extra->qs, buf_trans_qs.buffer, size_qs, M, K/4);
|
||||
transpose_2d_as_8b(backend_ctx, extra->qh, buf_trans_qh.buffer, size_qh, M, K/8);
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/32);
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q5_0_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_qs.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_qh.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_d.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_unpacked.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_0F));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_F0));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
cl_int err;
|
||||
@@ -7360,6 +7559,54 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
ggml_cl_buffer buf_trans_qs;
|
||||
ggml_cl_buffer buf_trans_qh;
|
||||
ggml_cl_buffer buf_trans_d;
|
||||
ggml_cl_buffer buf_trans_m;
|
||||
ggml_cl_buffer buf_unpacked;
|
||||
|
||||
cl_int M = tensor->ne[1];
|
||||
cl_int K = tensor->ne[0];
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
|
||||
size_t size_qs = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
|
||||
size_t size_qh = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(int32_t);
|
||||
size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
|
||||
size_t size_m = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
|
||||
|
||||
buf_trans_qs.allocate(backend_ctx->context, size_qs);
|
||||
buf_trans_qh.allocate(backend_ctx->context, size_qh);
|
||||
buf_trans_d.allocate(backend_ctx->context, size_d);
|
||||
buf_trans_m.allocate(backend_ctx->context, size_m);
|
||||
buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
|
||||
|
||||
// Transpose back: from col-major to row-major
|
||||
transpose_2d_as_16b(backend_ctx, extra->qs, buf_trans_qs.buffer, size_qs, M, K/4);
|
||||
transpose_2d_as_8b(backend_ctx, extra->qh, buf_trans_qh.buffer, size_qh, M, K/8);
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/32);
|
||||
transpose_2d_as_16b(backend_ctx, extra->m, buf_trans_m.buffer, size_m, M, K/32);
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q5_1_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_qs.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_qh.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_d.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_trans_m.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_unpacked.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
@@ -8552,7 +8799,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
nth *= 2;
|
||||
}
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
|
||||
int nchunks = 1;
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
const int chunk_target = nth * 4;
|
||||
nchunks = (ne00 + chunk_target - 1) / chunk_target;
|
||||
nchunks = MAX(1, MIN(nchunks, 64));
|
||||
}
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne10*nth*nchunks, (size_t)ne11, (size_t)ne12};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
@@ -11128,7 +11382,9 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
|
||||
|
||||
int nth = MIN(64, ne0);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_concat_f32;
|
||||
const bool concat_pack = (dim == 0 && ne0 < 32);
|
||||
cl_kernel kernel = concat_pack ? backend_ctx->kernel_concat_f32_pack
|
||||
: backend_ctx->kernel_concat_f32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
@@ -11155,10 +11411,28 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
if (concat_pack) {
|
||||
// packed kernel needs the dst dims to unflatten its 1-D row index.
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &ne3));
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
|
||||
const int base = MIN(64, maxwg);
|
||||
const int tpr = MIN(ne0, base); // threads per row
|
||||
const int rpw = MAX(1, base / tpr); // rows per workgroup
|
||||
const int lsz = tpr * rpw;
|
||||
const int nrows = ne1*ne2*ne3;
|
||||
const int nwg = (nrows + rpw - 1) / rpw;
|
||||
size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
|
||||
size_t local_work_size[] = {(size_t)lsz, 1, 1};
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
|
||||
} else {
|
||||
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
@@ -12176,6 +12450,368 @@ static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_q5_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)src0->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
|
||||
const int ne1 = dst->ne[1];
|
||||
|
||||
GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
|
||||
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_kernel kernel;
|
||||
|
||||
cl_int err;
|
||||
cl_image_format img_fmt;
|
||||
cl_image_desc img_desc;
|
||||
cl_buffer_region region;
|
||||
|
||||
int M = ne01;
|
||||
int N = ne1;
|
||||
int K = ne00;
|
||||
|
||||
if (ne1 == 1) {
|
||||
cl_mem qs_img = nullptr;
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
|
||||
// image for qs
|
||||
img_fmt = { CL_R, CL_UNSIGNED_INT32 };
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 2 / 4;
|
||||
img_desc.buffer = extra0_q5_0->qs;
|
||||
CL_CHECK((qs_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
kernel = backend_ctx->kernel_gemv_noshuffle_q5_0_f32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &qs_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_0->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne01));
|
||||
|
||||
size_t local_work_size[3] = {64, 4, 1};
|
||||
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(qs_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
} else {
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_sub_buf_trans = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
cl_mem b_img_trans = nullptr;
|
||||
cl_mem d_sub_buf = nullptr;
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// pad N to multiple of 8
|
||||
int extra_elements = N % 8;
|
||||
int padding = 0;
|
||||
if (extra_elements > 0){
|
||||
padding = 8 - extra_elements;
|
||||
}
|
||||
|
||||
// subbuffer for transposed activations
|
||||
region.origin = 0;
|
||||
region.size = K * (N + padding) * sizeof(float)/2;
|
||||
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
||||
CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for transposed activations
|
||||
img_fmt = {CL_RGBA, CL_HALF_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * (N + padding) / 4;
|
||||
img_desc.buffer = b_sub_buf_trans;
|
||||
CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// subbuffer for output
|
||||
region.origin = extrad->offset;
|
||||
region.size = M * N * sizeof(float);
|
||||
CL_CHECK((d_sub_buf = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// transpose activations
|
||||
int height_B = N/4;
|
||||
if (height_B == 0) {
|
||||
height_B = 1;
|
||||
}
|
||||
int width_B = K/4;
|
||||
int padded_height_B = (N + padding)/4;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_32_16;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
|
||||
|
||||
size_t local_work_size_t[2] = { 1, 16 };
|
||||
size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
|
||||
|
||||
// gemm
|
||||
kernel = backend_ctx->kernel_gemm_noshuffle_q5_0_f32;
|
||||
int padded_N = N + padding;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_0->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_0->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_sub_buf));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &padded_N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne1));
|
||||
|
||||
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
|
||||
size_t local_work_size[3] = {1, 128, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_img_trans));
|
||||
CL_CHECK(clReleaseMemObject(d_sub_buf));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(backend);
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(dst);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_q5_1_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
ggml_tensor_extra_cl_q5_1 * extra0_q5_1 = (ggml_tensor_extra_cl_q5_1 *)src0->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
|
||||
const int ne1 = dst->ne[1];
|
||||
|
||||
GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
|
||||
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_kernel kernel;
|
||||
|
||||
cl_int err;
|
||||
cl_image_format img_fmt;
|
||||
cl_image_desc img_desc;
|
||||
cl_buffer_region region;
|
||||
|
||||
int M = ne01;
|
||||
int N = ne1;
|
||||
int K = ne00;
|
||||
|
||||
if (ne1 == 1) {
|
||||
cl_mem qs_img = nullptr;
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
|
||||
// image for qs
|
||||
img_fmt = { CL_R, CL_UNSIGNED_INT32 };
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 2 / 4;
|
||||
img_desc.buffer = extra0_q5_1->qs;
|
||||
CL_CHECK((qs_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
kernel = backend_ctx->kernel_gemv_noshuffle_q5_1_f32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &qs_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_1->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_1->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_1->m));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne01));
|
||||
|
||||
size_t local_work_size[3] = {64, 4, 1};
|
||||
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(qs_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
} else {
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_sub_buf_trans = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
cl_mem b_img_trans = nullptr;
|
||||
cl_mem d_sub_buf = nullptr;
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// pad N to multiple of 8
|
||||
int extra_elements = N % 8;
|
||||
int padding = 0;
|
||||
if (extra_elements > 0){
|
||||
padding = 8 - extra_elements;
|
||||
}
|
||||
|
||||
// subbuffer for transposed activations
|
||||
region.origin = 0;
|
||||
region.size = K * (N + padding) * sizeof(float)/2;
|
||||
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
||||
CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for transposed activations
|
||||
img_fmt = {CL_RGBA, CL_HALF_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * (N + padding) / 4;
|
||||
img_desc.buffer = b_sub_buf_trans;
|
||||
CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// subbuffer for output
|
||||
region.origin = extrad->offset;
|
||||
region.size = M * N * sizeof(float);
|
||||
CL_CHECK((d_sub_buf = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// transpose activations
|
||||
int height_B = N/4;
|
||||
if (height_B == 0) {
|
||||
height_B = 1;
|
||||
}
|
||||
int width_B = K/4;
|
||||
int padded_height_B = (N + padding)/4;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_32_16;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
|
||||
|
||||
size_t local_work_size_t[2] = { 1, 16 };
|
||||
size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
|
||||
|
||||
// gemm
|
||||
kernel = backend_ctx->kernel_gemm_noshuffle_q5_1_f32;
|
||||
int padded_N = N + padding;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_1->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_1->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_1->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_1->m));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_sub_buf));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &padded_N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int), &ne1));
|
||||
|
||||
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
|
||||
size_t local_work_size[3] = {1, 128, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_img_trans));
|
||||
CL_CHECK(clReleaseMemObject(d_sub_buf));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(backend);
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(dst);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_ASSERT(src0);
|
||||
@@ -13214,6 +13850,18 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
return;
|
||||
}
|
||||
|
||||
// q5_0 x fp32
|
||||
if (src0t == GGML_TYPE_Q5_0 && src1t == GGML_TYPE_F32) {
|
||||
ggml_cl_mul_mat_q5_0_f32_adreno(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// q5_1 x fp32
|
||||
if (src0t == GGML_TYPE_Q5_1 && src1t == GGML_TYPE_F32) {
|
||||
ggml_cl_mul_mat_q5_1_f32_adreno(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// iq4_nl x fp32
|
||||
if (src0t == GGML_TYPE_IQ4_NL && src1t == GGML_TYPE_F32) {
|
||||
ggml_cl_mul_mat_iq4_nl_f32_adreno(backend, src0, src1, dst);
|
||||
@@ -14536,7 +15184,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
nth0 = 64;
|
||||
nth1 = 2;
|
||||
ndst = 4;
|
||||
ndst = 16;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
@@ -16633,7 +17281,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
kernel = backend_ctx->kernel_cpy_f32_f16;
|
||||
break;
|
||||
case GGML_TYPE_F32:
|
||||
kernel = backend_ctx->kernel_cpy_f32_f32;
|
||||
kernel = ne00 < 32 ? backend_ctx->kernel_cpy_f32_f32_pack
|
||||
: backend_ctx->kernel_cpy_f32_f32;
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "not implemented");
|
||||
@@ -16685,12 +17334,27 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
|
||||
|
||||
const int nth = MIN(64, ne00);
|
||||
if (kernel == backend_ctx->kernel_cpy_f32_f32_pack) {
|
||||
const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
|
||||
const int base = MIN(64, maxwg);
|
||||
const int tpr = MIN(ne00, base); // threads per row
|
||||
const int rpw = MAX(1, base / tpr); // rows per workgroup
|
||||
const int lsz = tpr * rpw; // <= base <= maxwg
|
||||
const int nrows = ne01*ne02*ne03;
|
||||
const int nwg = (nrows + rpw - 1) / rpw;
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
|
||||
size_t local_work_size[] = {(size_t)lsz, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, src1);
|
||||
} else {
|
||||
const int nth = MIN(64, ne00);
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
@@ -17705,7 +18369,7 @@ static void ggml_cl_gated_delta_net(ggml_backend_t backend, ggml_tensor * dst) {
|
||||
const cl_uint H_v = (cl_uint) src_v->ne[1];
|
||||
const cl_uint n_tokens = (cl_uint) src_v->ne[2];
|
||||
const cl_uint n_seqs = (cl_uint) src_v->ne[3];
|
||||
const cl_uint K = (cl_uint) src_state->ne[1];
|
||||
const cl_uint K = (cl_uint) ggml_get_op_params_i32(dst, 0);
|
||||
|
||||
int si;
|
||||
switch (S_v) {
|
||||
|
||||
@@ -49,3 +49,70 @@ kernel void kernel_concat_f32(
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_concat_f32_pack(
|
||||
global const char * src0,
|
||||
ulong offset0,
|
||||
global const char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
int dim,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int lsz = get_local_size(0);
|
||||
int tpr = min(ne0, lsz); // threads per row
|
||||
int rpw = lsz / tpr; // rows per workgroup
|
||||
int lid = get_local_id(0);
|
||||
int row = get_group_id(0)*rpw + lid / tpr;
|
||||
int lane = lid - (lid / tpr) * tpr;
|
||||
|
||||
int nrows = ne1*ne2*ne3;
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i1 = row % ne1;
|
||||
int t = row / ne1;
|
||||
int i2 = t % ne2;
|
||||
int i3 = t / ne2;
|
||||
|
||||
int o[4] = {0, 0, 0, 0};
|
||||
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
|
||||
|
||||
for (int i0 = lane; i0 < ne0; i0 += tpr) {
|
||||
global const float * x;
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (global const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
|
||||
} else {
|
||||
x = (global const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
|
||||
}
|
||||
|
||||
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -183,6 +183,65 @@ kernel void kernel_cpy_f32_f32(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_cpy_f32_f32_pack(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int lsz = get_local_size(0);
|
||||
int tpr = min(ne00, lsz); // threads per row
|
||||
int rpw = lsz / tpr; // rows per workgroup
|
||||
int lid = get_local_id(0);
|
||||
int row = get_group_id(0)*rpw + lid / tpr;
|
||||
int lane = lid - (lid / tpr) * tpr;
|
||||
|
||||
int nrows = ne01*ne02*ne03;
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i01 = row % ne01;
|
||||
int t = row / ne01;
|
||||
int i02 = t % ne02;
|
||||
int i03 = t / ne02;
|
||||
|
||||
// linear index of the first element of this row, unflattened over dst dims
|
||||
long n = (long)row * ne00;
|
||||
int i3 = (int)(n / ((long)ne2*ne1*ne0));
|
||||
long rm = n - (long)i3*ne2*ne1*ne0;
|
||||
int i2 = (int)(rm / ((long)ne1*ne0));
|
||||
rm -= (long)i2*ne1*ne0;
|
||||
int i1 = (int)(rm / ne0);
|
||||
int i0 = (int)(rm - (long)i1*ne0);
|
||||
|
||||
global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
for (int i00 = lane; i00 < ne00; i00 += tpr) {
|
||||
global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
dst_data[i00] = src[0];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_cpy_i32_i32(
|
||||
global int * src0,
|
||||
ulong offset0,
|
||||
|
||||
@@ -584,6 +584,60 @@ kernel void kernel_restore_block_q5_0(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_0_noshuffle(
|
||||
global struct block_q5_0 * src0,
|
||||
global uchar * dst_q,
|
||||
global uint * dst_qh,
|
||||
global half * dst_d
|
||||
) {
|
||||
global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0);
|
||||
global uchar * q = (global uchar *) dst_q + QK5_0/2*get_global_id(0);
|
||||
global uint * qh = (global uint *) dst_qh + get_global_id(0);
|
||||
global half * d = (global half *) dst_d + get_global_id(0);
|
||||
|
||||
*d = b->d;
|
||||
*qh = *((global uint *)(b->qh));
|
||||
|
||||
for (int i = 0; i < QK5_0/4; ++i) {
|
||||
uchar x0 = b->qs[2*i + 0];
|
||||
uchar x1 = b->qs[2*i + 1];
|
||||
|
||||
q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
||||
q[i + QK5_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
if (get_global_id(0) == 65536*4096) {
|
||||
printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q5_0_noshuffle(
|
||||
global uchar * src_q,
|
||||
global uint * src_qh,
|
||||
global half * src_d,
|
||||
global struct block_q5_0 * dst,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0);
|
||||
global uchar * q = (global uchar *) src_q + QK5_0/2*get_global_id(0);
|
||||
global uint * qh = (global uint *) src_qh + get_global_id(0);
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
|
||||
b->d = *d;
|
||||
*((global uint *)(b->qh)) = *qh;
|
||||
|
||||
for (int i = 0; i < QK5_0/4; ++i) {
|
||||
uchar x0 = q[i + 0 ];
|
||||
uchar x1 = q[i + QK5_0/4];
|
||||
|
||||
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
|
||||
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_0_trans4_ns(
|
||||
__global struct block_q5_0 * src0,
|
||||
__global uint * dst_qs,
|
||||
@@ -736,6 +790,66 @@ kernel void kernel_restore_block_q5_1(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_1_noshuffle(
|
||||
global struct block_q5_1 * src0,
|
||||
global uchar * dst_q,
|
||||
global uint * dst_qh,
|
||||
global half * dst_d,
|
||||
global half * dst_m
|
||||
) {
|
||||
global struct block_q5_1 * b = (global struct block_q5_1 *) src0 + get_global_id(0);
|
||||
global uchar * q = (global uchar *) dst_q + QK5_1/2*get_global_id(0);
|
||||
global uint * qh = (global uint *) dst_qh + get_global_id(0);
|
||||
global half * d = (global half *) dst_d + get_global_id(0);
|
||||
global half * m = (global half *) dst_m + get_global_id(0);
|
||||
|
||||
*d = b->d;
|
||||
*m = b->m;
|
||||
*qh = *((global uint *)(b->qh));
|
||||
|
||||
for (int i = 0; i < QK5_1/4; ++i) {
|
||||
uchar x0 = b->qs[2*i + 0];
|
||||
uchar x1 = b->qs[2*i + 1];
|
||||
|
||||
q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
||||
q[i + QK5_1/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
if (get_global_id(0) == 65536*4096) {
|
||||
printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q5_1_noshuffle(
|
||||
global uchar * src_q,
|
||||
global uint * src_qh,
|
||||
global half * src_d,
|
||||
global half * src_m,
|
||||
global struct block_q5_1 * dst,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q5_1 * b = (global struct block_q5_1 *) dst + get_global_id(0);
|
||||
global uchar * q = (global uchar *) src_q + QK5_1/2*get_global_id(0);
|
||||
global uint * qh = (global uint *) src_qh + get_global_id(0);
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
global half * m = (global half *) src_m + get_global_id(0);
|
||||
|
||||
b->d = *d;
|
||||
b->m = *m;
|
||||
*((global uint *)(b->qh)) = *qh;
|
||||
|
||||
for (int i = 0; i < QK5_1/4; ++i) {
|
||||
uchar x0 = q[i + 0 ];
|
||||
uchar x1 = q[i + QK5_1/4];
|
||||
|
||||
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
|
||||
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_1_trans4_ns(
|
||||
__global struct block_q5_1 * src0,
|
||||
__global uint * dst_qs,
|
||||
|
||||
@@ -123,7 +123,8 @@ kernel void kernel_gated_delta_net(
|
||||
const uint iq3 = seq_id / rq3; // seq index for Q and K
|
||||
|
||||
const uint state_size = S_V * S_V;
|
||||
const uint state_base = (seq_id * K * H_v + head_id) * state_size;
|
||||
// input state holds s0 only [S_v, S_v, H, n_seqs]: per-seq stride is H*D.
|
||||
const uint state_base = (seq_id * H_v + head_id) * state_size;
|
||||
const uint q_off_base = iq3 * sq3 + iq1 * sq1;
|
||||
const uint v_off_base = seq_id * sv3 + head_id * sv1;
|
||||
const uint gb_off_base = seq_id * sb3 + head_id * sb1;
|
||||
@@ -143,7 +144,8 @@ kernel void kernel_gated_delta_net(
|
||||
}
|
||||
}
|
||||
|
||||
const int shift = (int)n_tokens - (int)K;
|
||||
// snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
|
||||
// When n_tokens < K only slots 0..n_tokens-1 are written; older slots are caller-owned.
|
||||
uint attn_off = (seq_id * n_tokens * H_v + head_id) * S_V;
|
||||
|
||||
for (uint t = 0; t < n_tokens; t++) {
|
||||
@@ -219,7 +221,7 @@ kernel void kernel_gated_delta_net(
|
||||
attn_off += S_V * H_v;
|
||||
|
||||
if (K > 1u) {
|
||||
const int target_slot = (int)t - shift;
|
||||
const int target_slot = (int)n_tokens - 1 - (int)t;
|
||||
if (target_slot >= 0 && target_slot < (int)K) {
|
||||
#pragma unroll
|
||||
for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
|
||||
|
||||
@@ -0,0 +1,131 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_128
|
||||
#endif
|
||||
|
||||
kernel void kernel_gemm_noshuffle_q5_0_f32(
|
||||
global const ushort * src0_qs, // quantized A
|
||||
global const uchar * src0_qh, // 5th bits
|
||||
global const half * src0_d, // A scales
|
||||
__read_only image1d_buffer_t src1, // B (1d image)
|
||||
global float * dst, // C
|
||||
int m, // M
|
||||
int n, // N with padding
|
||||
int k, // K
|
||||
int n_no_padding // N without padding
|
||||
) {
|
||||
|
||||
int n_4 = n >> 2;
|
||||
|
||||
int gy = get_global_id(0);
|
||||
int gx = get_global_id(1);
|
||||
int gx_2 = gx << 2;
|
||||
|
||||
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
|
||||
half8 B;
|
||||
half4 dequantized_weights;
|
||||
|
||||
global const ushort * weight_ptr = src0_qs + gx_2;
|
||||
global const uchar * qh_ptr = src0_qh + gx_2;
|
||||
global const half * scale_ptr = src0_d + gx_2;
|
||||
|
||||
for (int i = 0; i < k; i += 4) {
|
||||
|
||||
B.s0123 = read_imageh(src1, gy*2 + i*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + i*n_4 + 1);
|
||||
|
||||
ushort4 bits4 = vload4(0, weight_ptr + (i >> 2)*m);
|
||||
uchar4 bits1 = vload4(0, qh_ptr + (i >> 3)*m);
|
||||
uchar4 qh = bits1 >> (uchar4)(i & 4);
|
||||
|
||||
half4 scale = vload4(0, scale_ptr + (i >> 5)*m);
|
||||
|
||||
// j=0
|
||||
dequantized_weights.s0 = (convert_half((bits4.s0 & 0x000F) | ((qh.s0 & 0x01) << 4)) - 16.0h) * scale.s0;
|
||||
dequantized_weights.s1 = (convert_half((bits4.s1 & 0x000F) | ((qh.s1 & 0x01) << 4)) - 16.0h) * scale.s1;
|
||||
dequantized_weights.s2 = (convert_half((bits4.s2 & 0x000F) | ((qh.s2 & 0x01) << 4)) - 16.0h) * scale.s2;
|
||||
dequantized_weights.s3 = (convert_half((bits4.s3 & 0x000F) | ((qh.s3 & 0x01) << 4)) - 16.0h) * scale.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=1
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i+1)*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i+1)*n_4 + 1);
|
||||
dequantized_weights.s0 = (convert_half(((bits4.s0 & 0x00F0) >> 4) | ((qh.s0 & 0x02) << 3)) - 16.0h) * scale.s0;
|
||||
dequantized_weights.s1 = (convert_half(((bits4.s1 & 0x00F0) >> 4) | ((qh.s1 & 0x02) << 3)) - 16.0h) * scale.s1;
|
||||
dequantized_weights.s2 = (convert_half(((bits4.s2 & 0x00F0) >> 4) | ((qh.s2 & 0x02) << 3)) - 16.0h) * scale.s2;
|
||||
dequantized_weights.s3 = (convert_half(((bits4.s3 & 0x00F0) >> 4) | ((qh.s3 & 0x02) << 3)) - 16.0h) * scale.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=2
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i+2)*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i+2)*n_4 + 1);
|
||||
dequantized_weights.s0 = (convert_half(((bits4.s0 & 0x0F00) >> 8) | ((qh.s0 & 0x04) << 2)) - 16.0h) * scale.s0;
|
||||
dequantized_weights.s1 = (convert_half(((bits4.s1 & 0x0F00) >> 8) | ((qh.s1 & 0x04) << 2)) - 16.0h) * scale.s1;
|
||||
dequantized_weights.s2 = (convert_half(((bits4.s2 & 0x0F00) >> 8) | ((qh.s2 & 0x04) << 2)) - 16.0h) * scale.s2;
|
||||
dequantized_weights.s3 = (convert_half(((bits4.s3 & 0x0F00) >> 8) | ((qh.s3 & 0x04) << 2)) - 16.0h) * scale.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=3
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i+3)*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i+3)*n_4 + 1);
|
||||
dequantized_weights.s0 = (convert_half(((bits4.s0 & 0xF000) >> 12) | ((qh.s0 & 0x08) << 1)) - 16.0h) * scale.s0;
|
||||
dequantized_weights.s1 = (convert_half(((bits4.s1 & 0xF000) >> 12) | ((qh.s1 & 0x08) << 1)) - 16.0h) * scale.s1;
|
||||
dequantized_weights.s2 = (convert_half(((bits4.s2 & 0xF000) >> 12) | ((qh.s2 & 0x08) << 1)) - 16.0h) * scale.s2;
|
||||
dequantized_weights.s3 = (convert_half(((bits4.s3 & 0xF000) >> 12) | ((qh.s3 & 0x08) << 1)) - 16.0h) * scale.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
}
|
||||
|
||||
int idx = (gy<<3)*m + (gx<<2);
|
||||
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,134 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_128
|
||||
#endif
|
||||
|
||||
kernel void kernel_gemm_noshuffle_q5_1_f32(
|
||||
global const ushort * src0_qs, // quantized A
|
||||
global const uchar * src0_qh, // 5th bits
|
||||
global const half * src0_d, // A scales
|
||||
global const half * src0_m, // A mins
|
||||
__read_only image1d_buffer_t src1, // B (1d image)
|
||||
global float * dst, // C
|
||||
int m, // M
|
||||
int n, // N with padding
|
||||
int k, // K
|
||||
int n_no_padding // N without padding
|
||||
) {
|
||||
|
||||
int n_4 = n >> 2;
|
||||
|
||||
int gy = get_global_id(0);
|
||||
int gx = get_global_id(1);
|
||||
int gx_2 = gx << 2;
|
||||
|
||||
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
|
||||
half8 B;
|
||||
half4 dequantized_weights;
|
||||
|
||||
global const ushort * weight_ptr = src0_qs + gx_2;
|
||||
global const uchar * qh_ptr = src0_qh + gx_2;
|
||||
global const half * scale_ptr = src0_d + gx_2;
|
||||
global const half * min_ptr = src0_m + gx_2;
|
||||
|
||||
for (int i = 0; i < k; i += 4) {
|
||||
|
||||
B.s0123 = read_imageh(src1, gy*2 + i*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + i*n_4 + 1);
|
||||
|
||||
ushort4 bits4 = vload4(0, weight_ptr + (i >> 2)*m);
|
||||
uchar4 bits1 = vload4(0, qh_ptr + (i >> 3)*m);
|
||||
uchar4 qh = bits1 >> (uchar4)(i & 4);
|
||||
|
||||
half4 scale = vload4(0, scale_ptr + (i >> 5)*m);
|
||||
half4 minv = vload4(0, min_ptr + (i >> 5)*m);
|
||||
|
||||
// j=0
|
||||
dequantized_weights.s0 = convert_half((bits4.s0 & 0x000F) | ((qh.s0 & 0x01) << 4)) * scale.s0 + minv.s0;
|
||||
dequantized_weights.s1 = convert_half((bits4.s1 & 0x000F) | ((qh.s1 & 0x01) << 4)) * scale.s1 + minv.s1;
|
||||
dequantized_weights.s2 = convert_half((bits4.s2 & 0x000F) | ((qh.s2 & 0x01) << 4)) * scale.s2 + minv.s2;
|
||||
dequantized_weights.s3 = convert_half((bits4.s3 & 0x000F) | ((qh.s3 & 0x01) << 4)) * scale.s3 + minv.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=1
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i+1)*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i+1)*n_4 + 1);
|
||||
dequantized_weights.s0 = convert_half(((bits4.s0 & 0x00F0) >> 4) | ((qh.s0 & 0x02) << 3)) * scale.s0 + minv.s0;
|
||||
dequantized_weights.s1 = convert_half(((bits4.s1 & 0x00F0) >> 4) | ((qh.s1 & 0x02) << 3)) * scale.s1 + minv.s1;
|
||||
dequantized_weights.s2 = convert_half(((bits4.s2 & 0x00F0) >> 4) | ((qh.s2 & 0x02) << 3)) * scale.s2 + minv.s2;
|
||||
dequantized_weights.s3 = convert_half(((bits4.s3 & 0x00F0) >> 4) | ((qh.s3 & 0x02) << 3)) * scale.s3 + minv.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=2
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i+2)*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i+2)*n_4 + 1);
|
||||
dequantized_weights.s0 = convert_half(((bits4.s0 & 0x0F00) >> 8) | ((qh.s0 & 0x04) << 2)) * scale.s0 + minv.s0;
|
||||
dequantized_weights.s1 = convert_half(((bits4.s1 & 0x0F00) >> 8) | ((qh.s1 & 0x04) << 2)) * scale.s1 + minv.s1;
|
||||
dequantized_weights.s2 = convert_half(((bits4.s2 & 0x0F00) >> 8) | ((qh.s2 & 0x04) << 2)) * scale.s2 + minv.s2;
|
||||
dequantized_weights.s3 = convert_half(((bits4.s3 & 0x0F00) >> 8) | ((qh.s3 & 0x04) << 2)) * scale.s3 + minv.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=3
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i+3)*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i+3)*n_4 + 1);
|
||||
dequantized_weights.s0 = convert_half(((bits4.s0 & 0xF000) >> 12) | ((qh.s0 & 0x08) << 1)) * scale.s0 + minv.s0;
|
||||
dequantized_weights.s1 = convert_half(((bits4.s1 & 0xF000) >> 12) | ((qh.s1 & 0x08) << 1)) * scale.s1 + minv.s1;
|
||||
dequantized_weights.s2 = convert_half(((bits4.s2 & 0xF000) >> 12) | ((qh.s2 & 0x08) << 1)) * scale.s2 + minv.s2;
|
||||
dequantized_weights.s3 = convert_half(((bits4.s3 & 0xF000) >> 12) | ((qh.s3 & 0x08) << 1)) * scale.s3 + minv.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
}
|
||||
|
||||
int idx = (gy<<3)*m + (gx<<2);
|
||||
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,291 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#endif
|
||||
|
||||
#define QK5_0 32
|
||||
#define NSUBGROUPS 4
|
||||
#define SUBGROUP_SIZE 64
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_hi(total_sums, bits4, bits1, scale, y) \
|
||||
float shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 0); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 0); \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 1); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 1); \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_lo(total_sums, bits4, bits1, scale, y) \
|
||||
shared_y = sub_group_broadcast(y.s0, 2); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 2); \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 3); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 3); \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_hi(total_sums, bits4, bits1, scale, y) \
|
||||
float8 shared_y; \
|
||||
shared_y = sub_group_broadcast(y, 0); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 1); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_lo(total_sums, bits4, bits1, scale, y) \
|
||||
shared_y = sub_group_broadcast(y, 2); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 3); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
__kernel void kernel_gemv_noshuffle_q5_0_f32(
|
||||
__read_only image1d_buffer_t src0_qs, // quantized A
|
||||
global ushort * src0_qh, // 5th bits
|
||||
global half2 * src0_d, // A scales
|
||||
__read_only image1d_buffer_t src1, // B activations
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00, // K
|
||||
int ne01) // M
|
||||
{
|
||||
uint groupId = get_local_id(1);
|
||||
uint gid = get_global_id(0);
|
||||
ushort slid = get_sub_group_local_id();
|
||||
|
||||
uint K = ne00;
|
||||
uint M = ne01;
|
||||
|
||||
uint LINE_STRIDE_A = M / 2;
|
||||
uint BLOCK_STRIDE_A = NSUBGROUPS * M;
|
||||
|
||||
private uint4 regA;
|
||||
private half2 regS;
|
||||
private float8 regB;
|
||||
|
||||
private float2 totalSum = (float2)(0.0f);
|
||||
|
||||
for (uint k = groupId; k < (K / QK5_0); k += NSUBGROUPS) {
|
||||
regS = src0_d[gid + k * LINE_STRIDE_A];
|
||||
|
||||
ushort4 qh_raw;
|
||||
qh_raw.s0 = src0_qh[gid + (4*k + 0) * LINE_STRIDE_A];
|
||||
qh_raw.s1 = src0_qh[gid + (4*k + 1) * LINE_STRIDE_A];
|
||||
qh_raw.s2 = src0_qh[gid + (4*k + 2) * LINE_STRIDE_A];
|
||||
qh_raw.s3 = src0_qh[gid + (4*k + 3) * LINE_STRIDE_A];
|
||||
|
||||
uchar8 raw = as_uchar8(qh_raw);
|
||||
uchar8 qh_bytes = (uchar8)(raw.s0, raw.s2, raw.s4, raw.s6,
|
||||
raw.s1, raw.s3, raw.s5, raw.s7);
|
||||
|
||||
// Load activations
|
||||
if (slid < 4) {
|
||||
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
|
||||
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
|
||||
}
|
||||
|
||||
regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
|
||||
regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
|
||||
regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
|
||||
regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
|
||||
|
||||
#ifdef VECTOR_SUB_GROUP_BROADCAST
|
||||
dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regB);
|
||||
#else
|
||||
dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regB);
|
||||
#endif // VECTOR_SUB_GROUP_BROADCAST
|
||||
|
||||
regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
|
||||
regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
|
||||
regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
|
||||
regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
|
||||
#ifdef VECTOR_SUB_GROUP_BROADCAST
|
||||
dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regB);
|
||||
#else
|
||||
dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regB);
|
||||
#endif // VECTOR_SUB_GROUP_BROADCAST
|
||||
}
|
||||
|
||||
// reduction in local memory, assumes #wave=4
|
||||
local float2 reduceLM[SUBGROUP_SIZE * 3];
|
||||
if (groupId == 1) {
|
||||
reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum;
|
||||
}
|
||||
if (groupId == 2) {
|
||||
reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum;
|
||||
}
|
||||
if (groupId == 3) {
|
||||
reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid];
|
||||
}
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid];
|
||||
}
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid];
|
||||
}
|
||||
|
||||
// 2 outputs per fiber in wave 0
|
||||
if (groupId == 0) {
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
vstore2(totalSum, 0, &(dst[gid * 2]));
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,294 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#endif
|
||||
|
||||
#define QK5_1 32
|
||||
#define NSUBGROUPS 4
|
||||
#define SUBGROUP_SIZE 64
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_hi(total_sums, bits4, bits1, scale, minv, y) \
|
||||
float shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 0); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 0); \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 1); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 1); \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_lo(total_sums, bits4, bits1, scale, minv, y) \
|
||||
shared_y = sub_group_broadcast(y.s0, 2); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 2); \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 3); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 3); \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_hi(total_sums, bits4, bits1, scale, minv, y) \
|
||||
float8 shared_y; \
|
||||
shared_y = sub_group_broadcast(y, 0); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 1); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_lo(total_sums, bits4, bits1, scale, minv, y) \
|
||||
shared_y = sub_group_broadcast(y, 2); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 3); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
__kernel void kernel_gemv_noshuffle_q5_1_f32(
|
||||
__read_only image1d_buffer_t src0_qs, // quantized A
|
||||
global ushort * src0_qh, // 5th bits
|
||||
global half2 * src0_d, // A scales
|
||||
global half2 * src0_m, // A mins
|
||||
__read_only image1d_buffer_t src1, // B activations
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00, // K
|
||||
int ne01) // M
|
||||
{
|
||||
uint groupId = get_local_id(1);
|
||||
uint gid = get_global_id(0);
|
||||
ushort slid = get_sub_group_local_id();
|
||||
|
||||
uint K = ne00;
|
||||
uint M = ne01;
|
||||
|
||||
uint LINE_STRIDE_A = M / 2;
|
||||
uint BLOCK_STRIDE_A = NSUBGROUPS * M;
|
||||
|
||||
__private uint4 regA;
|
||||
__private half2 regS;
|
||||
__private half2 regM;
|
||||
__private float8 regB;
|
||||
|
||||
__private float2 totalSum = (float2)(0.0f);
|
||||
|
||||
for (uint k = groupId; k < (K / QK5_1); k += NSUBGROUPS) {
|
||||
regS = src0_d[gid + k * LINE_STRIDE_A];
|
||||
regM = src0_m[gid + k * LINE_STRIDE_A];
|
||||
|
||||
ushort4 qh_raw;
|
||||
qh_raw.s0 = src0_qh[gid + (4*k + 0) * LINE_STRIDE_A];
|
||||
qh_raw.s1 = src0_qh[gid + (4*k + 1) * LINE_STRIDE_A];
|
||||
qh_raw.s2 = src0_qh[gid + (4*k + 2) * LINE_STRIDE_A];
|
||||
qh_raw.s3 = src0_qh[gid + (4*k + 3) * LINE_STRIDE_A];
|
||||
|
||||
uchar8 raw = as_uchar8(qh_raw);
|
||||
uchar8 qh_bytes = (uchar8)(raw.s0, raw.s2, raw.s4, raw.s6,
|
||||
raw.s1, raw.s3, raw.s5, raw.s7);
|
||||
|
||||
// Load activations
|
||||
if (slid < 4) {
|
||||
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
|
||||
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
|
||||
}
|
||||
|
||||
regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
|
||||
regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
|
||||
regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
|
||||
regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
|
||||
|
||||
#ifdef VECTOR_SUB_GROUP_BROADCAST
|
||||
dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB);
|
||||
#else
|
||||
dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB);
|
||||
#endif // VECTOR_SUB_GROUP_BROADCAST
|
||||
|
||||
regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
|
||||
regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
|
||||
regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
|
||||
regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
|
||||
#ifdef VECTOR_SUB_GROUP_BROADCAST
|
||||
dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB);
|
||||
#else
|
||||
dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB);
|
||||
#endif // VECTOR_SUB_GROUP_BROADCAST
|
||||
}
|
||||
|
||||
// reduction in local memory, assumes #wave=4
|
||||
local float2 reduceLM[SUBGROUP_SIZE * 3];
|
||||
if (groupId == 1) {
|
||||
reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum;
|
||||
}
|
||||
if (groupId == 2) {
|
||||
reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum;
|
||||
}
|
||||
if (groupId == 3) {
|
||||
reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid];
|
||||
}
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid];
|
||||
}
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid];
|
||||
}
|
||||
|
||||
// 2 outputs per fiber in wave 0
|
||||
if (groupId == 0) {
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
vstore2(totalSum, 0, &(dst[gid * 2]));
|
||||
}
|
||||
|
||||
}
|
||||
@@ -82,21 +82,27 @@ kernel void kernel_get_rows_f32(
|
||||
src1 = (global int*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int i10 = get_group_id(0);
|
||||
int i11 = get_group_id(1);
|
||||
int i12 = get_group_id(2);
|
||||
int nchunks = get_num_groups(0) / ne10;
|
||||
int g = get_group_id(0);
|
||||
int i10 = g / nchunks;
|
||||
int chunk = g - i10 * nchunks;
|
||||
int i11 = get_group_id(1);
|
||||
int i12 = get_group_id(2);
|
||||
|
||||
int r = ((global int *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0];
|
||||
|
||||
int i02 = i11;
|
||||
int i03 = i12;
|
||||
|
||||
for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
|
||||
if (ind >= ne00) {
|
||||
return;
|
||||
}
|
||||
((global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1))[ind] =
|
||||
((global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03))[ind];
|
||||
global float * dst_row = (global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1);
|
||||
global float * src_row = (global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
int span = (ne00 + nchunks - 1) / nchunks;
|
||||
int start = chunk * span;
|
||||
int end = min(start + span, ne00);
|
||||
|
||||
for (int ind = start + get_local_id(0); ind < end; ind += get_local_size(0)) {
|
||||
dst_row[ind] = src_row[ind];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -33,13 +33,15 @@ inline float block_q_6_K_dot_y_flat(
|
||||
global uchar * blk_qh,
|
||||
global char * blk_scales,
|
||||
global half * blk_d,
|
||||
global float * yy,
|
||||
int ib,
|
||||
int ip,
|
||||
int is,
|
||||
int l0
|
||||
int l0,
|
||||
float4 y0,
|
||||
float4 y1,
|
||||
float4 y2,
|
||||
float4 y3
|
||||
) {
|
||||
int y_offset = 128*ip + l0;
|
||||
int q_offset_l = 64*ip + l0;
|
||||
int q_offset_h = 32*ip + l0;
|
||||
|
||||
@@ -48,36 +50,28 @@ inline float block_q_6_K_dot_y_flat(
|
||||
global uchar * qh = blk_qh + ib*64 + q_offset_h;
|
||||
global char * sc = blk_scales + ib*16 + is;
|
||||
|
||||
global float * y = yy + ib * QK_K + y_offset;
|
||||
|
||||
float dall = blk_d[ib];
|
||||
|
||||
float sumf = 0;
|
||||
float4 sums = {0.f, 0.f, 0.f, 0.f};
|
||||
// Vectorized loads: 3 uchar4 weight loads instead of 12 scalar byte reads.
|
||||
// q_offset_l/h are 4-aligned, so these are aligned vector loads.
|
||||
uchar4 q1v = vload4(0, q1);
|
||||
uchar4 q2v = vload4(0, q2);
|
||||
uchar4 qhv = vload4(0, qh);
|
||||
|
||||
sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & Q6_K_MASK1) << 4)) - 32.f);
|
||||
sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & Q6_K_MASK2) << 2)) - 32.f);
|
||||
sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & Q6_K_MASK3) << 0)) - 32.f);
|
||||
sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & Q6_K_MASK4) >> 2)) - 32.f);
|
||||
int4 q1i = convert_int4(q1v);
|
||||
int4 q2i = convert_int4(q2v);
|
||||
int4 qhi = convert_int4(qhv);
|
||||
|
||||
sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & Q6_K_MASK1) << 4)) - 32.f);
|
||||
sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & Q6_K_MASK2) << 2)) - 32.f);
|
||||
sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & Q6_K_MASK3) << 0)) - 32.f);
|
||||
sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & Q6_K_MASK4) >> 2)) - 32.f);
|
||||
// Reconstruct the four 6-bit weight groups (low/high nibble of ql OR'd with the
|
||||
// matching 2-bit plane of qh), same arithmetic as the scalar version, then dot()
|
||||
// against the cached activation lanes.
|
||||
float4 w0 = convert_float4((q1i & 0xF) | ((qhi & Q6_K_MASK1) << 4)) - 32.f;
|
||||
float4 w1 = convert_float4((q2i & 0xF) | ((qhi & Q6_K_MASK2) << 2)) - 32.f;
|
||||
float4 w2 = convert_float4((q1i >> 4) | ((qhi & Q6_K_MASK3) )) - 32.f;
|
||||
float4 w3 = convert_float4((q2i >> 4) | ((qhi & Q6_K_MASK4) >> 2)) - 32.f;
|
||||
|
||||
sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & Q6_K_MASK1) << 4)) - 32.f);
|
||||
sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & Q6_K_MASK2) << 2)) - 32.f);
|
||||
sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & Q6_K_MASK3) << 0)) - 32.f);
|
||||
sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & Q6_K_MASK4) >> 2)) - 32.f);
|
||||
|
||||
sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & Q6_K_MASK1) << 4)) - 32.f);
|
||||
sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & Q6_K_MASK2) << 2)) - 32.f);
|
||||
sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & Q6_K_MASK3) << 0)) - 32.f);
|
||||
sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & Q6_K_MASK4) >> 2)) - 32.f);
|
||||
|
||||
sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
|
||||
|
||||
return sumf;
|
||||
return dall * (dot(y0, w0) * sc[0] + dot(y1, w1) * sc[2] +
|
||||
dot(y2, w2) * sc[4] + dot(y3, w3) * sc[6]);
|
||||
}
|
||||
|
||||
#undef N_DST
|
||||
@@ -89,7 +83,7 @@ inline float block_q_6_K_dot_y_flat(
|
||||
#define N_SIMDGROUP 2
|
||||
#define N_SIMDWIDTH 16
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_DST 4
|
||||
#define N_DST 16
|
||||
#define N_SIMDGROUP 2
|
||||
#define N_SIMDWIDTH 64
|
||||
#endif
|
||||
@@ -146,49 +140,39 @@ kernel void kernel_mul_mv_q6_K_f32_flat(
|
||||
global half * blk_d = (global half *) src0_d + offset_src0_d;
|
||||
global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
|
||||
int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
|
||||
int tid = get_sub_group_local_id()%(N_SIMDWIDTH/BLOCK_STRIDE); // within-super-block part, 0..15
|
||||
int ix = get_sub_group_local_id()/(N_SIMDWIDTH/BLOCK_STRIDE); // super-block selector, 0..BLOCK_STRIDE-1
|
||||
int ip = tid/8; // first or second half of (super) block (0 or 1)
|
||||
int il = tid%8; // each half has 8 parts, one per scale
|
||||
int n = 4; // 4 scales at a time (and 4 sums)
|
||||
int l0 = n*il; // offset into half-block, 0..28
|
||||
int is = 8*ip + l0/16; // 0, 1, 8, 9
|
||||
|
||||
float4 sumf = 0;
|
||||
float sumf[N_DST];
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
sumf[row] = 0.f;
|
||||
}
|
||||
|
||||
for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) {
|
||||
if (first_row + 0 < ne01) {
|
||||
sumf.s0 += block_q_6_K_dot_y_flat(blk_ql + 0*nb*128, blk_qh + 0*nb*64, blk_scales + 0*nb*16, blk_d + 0*nb, yy, ib, ip, is, l0);
|
||||
}
|
||||
if (first_row + 1 < ne01) {
|
||||
sumf.s1 += block_q_6_K_dot_y_flat(blk_ql + 1*nb*128, blk_qh + 1*nb*64, blk_scales + 1*nb*16, blk_d + 1*nb, yy, ib, ip, is, l0);
|
||||
}
|
||||
if (first_row + 2 < ne01) {
|
||||
sumf.s2 += block_q_6_K_dot_y_flat(blk_ql + 2*nb*128, blk_qh + 2*nb*64, blk_scales + 2*nb*16, blk_d + 2*nb, yy, ib, ip, is, l0);
|
||||
}
|
||||
if (first_row + 3 < ne01) {
|
||||
sumf.s3 += block_q_6_K_dot_y_flat(blk_ql + 3*nb*128, blk_qh + 3*nb*64, blk_scales + 3*nb*16, blk_d + 3*nb, yy, ib, ip, is, l0);
|
||||
global float * y = yy + ib * QK_K + 128*ip + l0;
|
||||
float4 y0 = vload4(0, y + 0);
|
||||
float4 y1 = vload4(0, y + 32);
|
||||
float4 y2 = vload4(0, y + 64);
|
||||
float4 y3 = vload4(0, y + 96);
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
if (first_row + row < ne01) {
|
||||
sumf[row] += block_q_6_K_dot_y_flat(
|
||||
blk_ql + row*nb*128, blk_qh + row*nb*64, blk_scales + row*nb*16, blk_d + row*nb,
|
||||
ib, ip, is, l0, y0, y1, y2, y3);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float4 tot = (float4)(
|
||||
sub_group_reduce_add(sumf.s0),
|
||||
sub_group_reduce_add(sumf.s1),
|
||||
sub_group_reduce_add(sumf.s2),
|
||||
sub_group_reduce_add(sumf.s3)
|
||||
);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
if (first_row + 0 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
|
||||
}
|
||||
if (first_row + 1 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
|
||||
}
|
||||
if (first_row + 2 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
|
||||
}
|
||||
if (first_row + 3 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
float tot = sub_group_reduce_add(sumf[row]);
|
||||
if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,9 +44,9 @@ void gated_delta_net_sycl(const float * q,
|
||||
float * attn_data = dst;
|
||||
float * state = dst + attn_score_elems;
|
||||
|
||||
// input state layout (D, K, n_seqs) — seq stride is K * D = K * H * S_v * S_v.
|
||||
// input state holds s0 only [S_v, S_v, H, n_seqs] — seq stride is D = H * S_v * S_v.
|
||||
// output state layout (per-slot D * n_seqs) — same per-(seq,head) offset as before.
|
||||
const int64_t state_in_offset = sequence * K * H * S_v * S_v + h_idx * S_v * S_v;
|
||||
const int64_t state_in_offset = sequence * H * S_v * S_v + h_idx * S_v * S_v;
|
||||
const int64_t state_out_offset = (sequence * H + h_idx) * S_v * S_v;
|
||||
const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output
|
||||
state += state_out_offset;
|
||||
@@ -63,9 +63,8 @@ void gated_delta_net_sycl(const float * q,
|
||||
s_shard[r] = curr_state[i];
|
||||
}
|
||||
|
||||
// slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots
|
||||
// are written; earlier slots are left untouched (caller-owned).
|
||||
const int shift = (int) n_tokens - K;
|
||||
// snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
|
||||
// When n_tokens < K only slots 0..n_tokens-1 are written; older slots are caller-owned.
|
||||
|
||||
for (int t = 0; t < n_tokens; t++) {
|
||||
const float * q_t = q + iq3 * sq3 + t * sq2 + iq1 * sq1;
|
||||
@@ -144,7 +143,7 @@ void gated_delta_net_sycl(const float * q,
|
||||
|
||||
// Write state back to global memory
|
||||
if constexpr (keep_rs_t) {
|
||||
const int target_slot = t - shift;
|
||||
const int target_slot = (int) n_tokens - 1 - t;
|
||||
if (target_slot >= 0 && target_slot < K) {
|
||||
float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset;
|
||||
#pragma unroll
|
||||
@@ -315,8 +314,8 @@ void ggml_sycl_op_gated_delta_net(ggml_backend_sycl_context & ctx, ggml_tensor *
|
||||
|
||||
dpct::queue_ptr stream = ctx.stream();
|
||||
|
||||
// state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
|
||||
const int K = (int) src_state->ne[1];
|
||||
// K (snapshot slot count) is an op param; state holds s0 only [S_v, S_v, H, n_seqs].
|
||||
const int K = ggml_get_op_params_i32(dst, 0);
|
||||
const bool keep_rs = K > 1;
|
||||
|
||||
if (kda) {
|
||||
|
||||
@@ -113,6 +113,21 @@ typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR {
|
||||
} VkPhysicalDeviceShaderBfloat16FeaturesKHR;
|
||||
#endif
|
||||
|
||||
#if !defined(VK_VALVE_shader_mixed_float_dot_product)
|
||||
#define VK_VALVE_shader_mixed_float_dot_product 1
|
||||
#define VK_VALVE_SHADER_MIXED_FLOAT_DOT_PRODUCT_SPEC_VERSION 1
|
||||
#define VK_VALVE_SHADER_MIXED_FLOAT_DOT_PRODUCT_EXTENSION_NAME "VK_VALVE_shader_mixed_float_dot_product"
|
||||
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MIXED_FLOAT_DOT_PRODUCT_FEATURES_VALVE ((VkStructureType)1000673000)
|
||||
typedef struct VkPhysicalDeviceShaderMixedFloatDotProductFeaturesVALVE {
|
||||
VkStructureType sType;
|
||||
void* pNext;
|
||||
VkBool32 shaderMixedFloatDotProductFloat16AccFloat32;
|
||||
VkBool32 shaderMixedFloatDotProductFloat16AccFloat16;
|
||||
VkBool32 shaderMixedFloatDotProductBFloat16Acc;
|
||||
VkBool32 shaderMixedFloatDotProductFloat8AccFloat32;
|
||||
} VkPhysicalDeviceShaderMixedFloatDotProductFeaturesVALVE;
|
||||
#endif
|
||||
|
||||
#define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1))
|
||||
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
|
||||
static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
|
||||
@@ -705,6 +720,8 @@ struct vk_device_struct {
|
||||
bool coopmat2_bf16_support {};
|
||||
bool coopmat2_decode_vector;
|
||||
|
||||
bool dot2_f16 {};
|
||||
|
||||
bool pipeline_executable_properties_support {};
|
||||
|
||||
size_t idx;
|
||||
@@ -816,6 +833,7 @@ struct vk_device_struct {
|
||||
|
||||
// [src/dst 0=fp32,1=fp16]
|
||||
vk_pipeline pipeline_exp[2];
|
||||
vk_pipeline pipeline_expm1[2];
|
||||
vk_pipeline pipeline_elu[2];
|
||||
vk_pipeline pipeline_gelu[2];
|
||||
vk_pipeline pipeline_gelu_erf[2];
|
||||
@@ -1185,30 +1203,35 @@ struct vk_op_glu_push_constants {
|
||||
uint32_t mode; // 0: default, 1: swapped, 2: split
|
||||
float alpha; // for swiglu_oai
|
||||
float limit;
|
||||
uint32_t nb00;
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb03;
|
||||
uint32_t ne01;
|
||||
uint32_t ne02;
|
||||
uint32_t nb10;
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
uint32_t ne11;
|
||||
uint32_t ne12;
|
||||
uint32_t nb20;
|
||||
uint32_t nb21;
|
||||
uint32_t nb22;
|
||||
uint32_t nb23;
|
||||
uint32_t ne21;
|
||||
uint32_t ne22;
|
||||
uint32_t misalign_offsets;
|
||||
uint32_t ne2_012mp; uint32_t ne2_012L;
|
||||
uint32_t ne2_01mp; uint32_t ne2_01L;
|
||||
uint32_t ne2_0mp; uint32_t ne2_0L;
|
||||
};
|
||||
static_assert(sizeof(vk_op_glu_push_constants) <= 128, "sizeof(vk_op_glu_push_constants) must be <= 128");
|
||||
|
||||
struct vk_op_unary_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
|
||||
uint32_t misalign_offsets;
|
||||
float param1; float param2;
|
||||
uint32_t ne0_012mp; uint32_t ne0_012L;
|
||||
uint32_t ne0_01mp; uint32_t ne0_01L;
|
||||
uint32_t ne0_0mp; uint32_t ne0_0L;
|
||||
uint32_t ne1_012mp; uint32_t ne1_012L;
|
||||
uint32_t ne1_01mp; uint32_t ne1_01L;
|
||||
uint32_t ne1_0mp; uint32_t ne1_0L;
|
||||
float param1; float param2; float param3; float param4;
|
||||
uint32_t ne0_012mp; uint32_t ne0_01mp; uint32_t ne0_0mp; uint32_t ne0_Ls;
|
||||
uint32_t ne1_012mp; uint32_t ne1_01mp; uint32_t ne1_0mp; uint32_t ne1_Ls;
|
||||
};
|
||||
static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");
|
||||
|
||||
@@ -1313,6 +1336,10 @@ static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L)
|
||||
mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1);
|
||||
}
|
||||
|
||||
static uint32_t pack_fastdiv_L(uint32_t L0, uint32_t L1, uint32_t L2) {
|
||||
return L0 | (L1 << 8) | (L2 << 16);
|
||||
}
|
||||
|
||||
template <typename T> void init_pushconst_fastdiv(T &p) {
|
||||
GGML_UNUSED(p);
|
||||
static_assert(!std::is_const<T>::value, "unexpected type");
|
||||
@@ -1320,12 +1347,29 @@ template <typename T> void init_pushconst_fastdiv(T &p) {
|
||||
|
||||
template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) {
|
||||
// Compute magic values to divide by these six numbers.
|
||||
init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, p.ne0_012L);
|
||||
init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, p.ne0_01L);
|
||||
init_fastdiv_values(p.ne00, p.ne0_0mp, p.ne0_0L);
|
||||
init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, p.ne1_012L);
|
||||
init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, p.ne1_01L);
|
||||
init_fastdiv_values(p.ne10, p.ne1_0mp, p.ne1_0L);
|
||||
uint32_t ne0_012L;
|
||||
uint32_t ne0_01L;
|
||||
uint32_t ne0_0L;
|
||||
uint32_t ne1_012L;
|
||||
uint32_t ne1_01L;
|
||||
uint32_t ne1_0L;
|
||||
|
||||
init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, ne0_012L);
|
||||
init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, ne0_01L);
|
||||
init_fastdiv_values(p.ne00, p.ne0_0mp, ne0_0L);
|
||||
init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, ne1_012L);
|
||||
init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, ne1_01L);
|
||||
init_fastdiv_values(p.ne10, p.ne1_0mp, ne1_0L);
|
||||
|
||||
p.ne0_Ls = pack_fastdiv_L(ne0_012L, ne0_01L, ne0_0L);
|
||||
p.ne1_Ls = pack_fastdiv_L(ne1_012L, ne1_01L, ne1_0L);
|
||||
}
|
||||
|
||||
template <> void init_pushconst_fastdiv(vk_op_glu_push_constants &p) {
|
||||
// GLU linearizes over dst, then uses dst coordinates for src0/src1.
|
||||
init_fastdiv_values(p.ne22*p.ne21*p.ne20, p.ne2_012mp, p.ne2_012L);
|
||||
init_fastdiv_values(p.ne21*p.ne20, p.ne2_01mp, p.ne2_01L);
|
||||
init_fastdiv_values(p.ne20, p.ne2_0mp, p.ne2_0L);
|
||||
}
|
||||
|
||||
struct vk_op_binary_push_constants {
|
||||
@@ -1976,6 +2020,9 @@ struct ggml_backend_vk_context {
|
||||
// Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
|
||||
vk_pipeline_struct * prealloc_y_last_pipeline_used {};
|
||||
const ggml_tensor * prealloc_y_last_tensor_used {};
|
||||
// True when prealloc_y holds the padded fp16 layout used by the coopmat2 B decode-vector callback.
|
||||
// If false, then it's contiguous.
|
||||
bool prealloc_y_last_decode_vector_staging {};
|
||||
|
||||
// Track which nodes have been used since the last sync, and whether they were written to
|
||||
std::vector<const ggml_tensor *> unsynced_nodes_written;
|
||||
@@ -3374,7 +3421,9 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
|
||||
switch (src0_type) {
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ1_M:
|
||||
lut_size = 2*2048 + 4*2048;
|
||||
// Regular matmul uses the compact uint16_t IQ1 grid; the expanded
|
||||
// uint32_t grid is only enabled for the q8_1/int-dot vector path.
|
||||
lut_size = 2*2048;
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
lut_size = 8*256;
|
||||
@@ -3652,9 +3701,10 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
s_mmq_wg_denoms_k = { 32, 64, 1 };
|
||||
|
||||
// spec constants and tile sizes for quant matmul_id
|
||||
l_warptile_mmqid = { 256, 128, 128, 32, 1, device->subgroup_size };
|
||||
m_warptile_mmqid = { 256, 128, 64, 32, 0, device->subgroup_size };
|
||||
s_warptile_mmqid = { 256, 128, 64, 32, 0, device->subgroup_size };
|
||||
const uint32_t mmqid_bk = device->coopmat2_decode_vector ? 64u : 32u;
|
||||
l_warptile_mmqid = { 256, 128, 128, mmqid_bk, 1, device->subgroup_size };
|
||||
m_warptile_mmqid = { 256, 128, 64, mmqid_bk, 0, device->subgroup_size };
|
||||
s_warptile_mmqid = { 256, 128, 64, mmqid_bk, 0, device->subgroup_size };
|
||||
l_mmqid_wg_denoms = { 128, 128, 1 };
|
||||
m_mmqid_wg_denoms = { 128, 64, 1 };
|
||||
s_mmqid_wg_denoms = { 128, 64, 1 };
|
||||
@@ -3916,8 +3966,13 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16";
|
||||
} else {
|
||||
if (device->fp16) {
|
||||
if (f32acc) { spv_data = flash_attn_f32_f16_data; spv_size = flash_attn_f32_f16_len; }
|
||||
else { spv_data = flash_attn_f32_f16_f16acc_data; spv_size = flash_attn_f32_f16_f16acc_len; }
|
||||
if (device->dot2_f16) {
|
||||
if (f32acc) { spv_data = flash_attn_f32_f16_dot2_data; spv_size = flash_attn_f32_f16_dot2_len; }
|
||||
else { spv_data = flash_attn_f32_f16_dot2_f16acc_data; spv_size = flash_attn_f32_f16_dot2_f16acc_len; }
|
||||
} else {
|
||||
if (f32acc) { spv_data = flash_attn_f32_f16_data; spv_size = flash_attn_f32_f16_len; }
|
||||
else { spv_data = flash_attn_f32_f16_f16acc_data; spv_size = flash_attn_f32_f16_f16acc_len; }
|
||||
}
|
||||
} else {
|
||||
spv_data = flash_attn_f32_f16_fp32_data;
|
||||
spv_size = flash_attn_f32_f16_fp32_len;
|
||||
@@ -4211,7 +4266,23 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
#endif // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (device->fp16) {
|
||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||
// Selects dot2 SPIR-V variant at runtime when device->dot2_f16 is true
|
||||
#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _s[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _s[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
|
||||
// bf16 scalar path promotes to f32, no dot2 variant
|
||||
#define CREATE_MM_NODOT2(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
@@ -4246,7 +4317,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0);
|
||||
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0);
|
||||
CREATE_MM_NODOT2(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0);
|
||||
|
||||
CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0], matmul_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0);
|
||||
CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0);
|
||||
@@ -4254,7 +4325,6 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0);
|
||||
CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1], matmul_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0);
|
||||
CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0], matmul_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0);
|
||||
|
||||
CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K], matmul_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0);
|
||||
CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K], matmul_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0);
|
||||
CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K], matmul_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0);
|
||||
@@ -4294,8 +4364,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
|
||||
CREATE_MM_NODOT2(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||
CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||
CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||
@@ -4340,8 +4409,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
|
||||
CREATE_MM_NODOT2(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_q1_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
@@ -4386,6 +4454,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
#undef CREATE_MM2
|
||||
#undef CREATE_MMQ
|
||||
#undef CREATE_MM
|
||||
#undef CREATE_MM_NODOT2
|
||||
} else {
|
||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||
#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \
|
||||
@@ -4964,8 +5033,8 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_repeat_i16, "repeat_i16", repeat_i16_len, repeat_i16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
#define CREATE_UNARY(name) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
CREATE_UNARY(elu)
|
||||
CREATE_UNARY(gelu)
|
||||
@@ -4988,6 +5057,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
CREATE_UNARY(trunc)
|
||||
CREATE_UNARY(sgn)
|
||||
CREATE_UNARY(exp)
|
||||
CREATE_UNARY(expm1)
|
||||
#undef CREATE_UNARY
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f16, "add1_f16_f16", add1_f16_f16_len, add1_f16_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
@@ -5084,6 +5154,14 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
}
|
||||
++idx;
|
||||
}
|
||||
} else if (device->driver_id != vk::DriverId::eIntelProprietaryWindows) {
|
||||
// Disabled on Intel Windows due to a driver bug: https://github.com/ggml-org/llama.cpp/pull/23964#issuecomment-4598226147
|
||||
int idx = 0;
|
||||
for (uint32_t n : {64, 128, 256, 512}) {
|
||||
const uint32_t block_size = std::min(device->subgroup_size, n);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_shmem_f32", fwht_shmem_f32_len, fwht_shmem_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { block_size, n }, 1);
|
||||
++idx;
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4;
|
||||
@@ -5441,6 +5519,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device->integer_dot_product = false;
|
||||
device->shader_64b_indexing = false;
|
||||
bool bfloat16_support = false;
|
||||
bool dot2_f16_support = false;
|
||||
|
||||
for (const auto& properties : ext_props) {
|
||||
if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
|
||||
@@ -5483,6 +5562,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
!getenv("GGML_VK_DISABLE_BFLOAT16")) {
|
||||
bfloat16_support = true;
|
||||
#endif
|
||||
} else if (strcmp("VK_VALVE_shader_mixed_float_dot_product", properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_DOT2")) {
|
||||
dot2_f16_support = true;
|
||||
} else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) {
|
||||
pipeline_executable_properties_support = true;
|
||||
} else if (strcmp("VK_EXT_memory_priority", properties.extensionName) == 0 &&
|
||||
@@ -5630,6 +5712,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
#endif
|
||||
device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle);
|
||||
#ifdef __APPLE__
|
||||
if (device->vendor_id == VK_VENDOR_ID_AMD) {
|
||||
device->subgroup_shuffle = false;
|
||||
}
|
||||
#endif
|
||||
device->subgroup_clustered = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eClustered);
|
||||
|
||||
@@ -5785,6 +5872,14 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device_extensions.push_back("VK_KHR_shader_integer_dot_product");
|
||||
}
|
||||
|
||||
VkPhysicalDeviceShaderMixedFloatDotProductFeaturesVALVE dot2_features {};
|
||||
dot2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MIXED_FLOAT_DOT_PRODUCT_FEATURES_VALVE;
|
||||
if (dot2_f16_support) {
|
||||
last_struct->pNext = (VkBaseOutStructure *)&dot2_features;
|
||||
last_struct = (VkBaseOutStructure *)&dot2_features;
|
||||
device_extensions.push_back("VK_VALVE_shader_mixed_float_dot_product");
|
||||
}
|
||||
|
||||
VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR pep_features {};
|
||||
pep_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR;
|
||||
if (pipeline_executable_properties_support) {
|
||||
@@ -5819,6 +5914,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device->bf16 = false;
|
||||
#endif
|
||||
|
||||
device->dot2_f16 = dot2_f16_support && dot2_features.shaderMixedFloatDotProductFloat16AccFloat32;
|
||||
|
||||
device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
|
||||
|
||||
device->multi_add = vk12_props.shaderRoundingModeRTEFloat16 &&
|
||||
@@ -6133,6 +6230,19 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
break;
|
||||
}
|
||||
|
||||
#if VK_HEADER_VERSION >= 287
|
||||
// Honeykrisp driver for Asahi Linux doesn't report VK_VENDOR_ID_APPLE.
|
||||
// Check for Honeykrisp driver and force same configuration as the VK_VENDOR_ID_APPLE case.
|
||||
if (device->driver_id == vk::DriverId::eMesaHoneykrisp) {
|
||||
device->mul_mat_l[i] = false;
|
||||
device->mul_mat_m[i] = true;
|
||||
device->mul_mat_s[i] = false;
|
||||
device->mul_mat_id_l[i] = false;
|
||||
device->mul_mat_id_m[i] = true;
|
||||
device->mul_mat_id_s[i] = false;
|
||||
}
|
||||
#endif
|
||||
|
||||
device->mul_mat_l_int[i] = device->mul_mat_l[i];
|
||||
device->mul_mat_m_int[i] = device->mul_mat_m[i];
|
||||
device->mul_mat_s_int[i] = device->mul_mat_s[i];
|
||||
@@ -6233,6 +6343,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
bool coopmat2_decode_vector_support = false;
|
||||
bool integer_dot_product = false;
|
||||
bool bfloat16_support = false;
|
||||
bool dot2_f16_support = false;
|
||||
|
||||
for (auto properties : ext_props) {
|
||||
if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
|
||||
@@ -6262,6 +6373,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
!getenv("GGML_VK_DISABLE_BFLOAT16")) {
|
||||
bfloat16_support = true;
|
||||
#endif
|
||||
} else if (strcmp("VK_VALVE_shader_mixed_float_dot_product", properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_DOT2")) {
|
||||
dot2_f16_support = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6336,6 +6450,15 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(VK_NV_cooperative_matrix2)
|
||||
VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {};
|
||||
coopmat2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV;
|
||||
if (coopmat2_support) {
|
||||
last_struct->pNext = (VkBaseOutStructure *)&coopmat2_features;
|
||||
last_struct = (VkBaseOutStructure *)&coopmat2_features;
|
||||
}
|
||||
#endif
|
||||
|
||||
VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {};
|
||||
coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV;
|
||||
if (coopmat2_decode_vector_support) {
|
||||
@@ -6343,6 +6466,13 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
|
||||
}
|
||||
|
||||
VkPhysicalDeviceShaderMixedFloatDotProductFeaturesVALVE dot2_features {};
|
||||
dot2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MIXED_FLOAT_DOT_PRODUCT_FEATURES_VALVE;
|
||||
if (dot2_f16_support) {
|
||||
last_struct->pNext = (VkBaseOutStructure *)&dot2_features;
|
||||
last_struct = (VkBaseOutStructure *)&dot2_features;
|
||||
}
|
||||
|
||||
vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);
|
||||
|
||||
fp16 = fp16 && vk12_features.shaderFloat16;
|
||||
@@ -6367,6 +6497,19 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
#endif
|
||||
&& ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture);
|
||||
|
||||
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
coopmat2_support = coopmat2_support &&
|
||||
coopmat2_features.cooperativeMatrixWorkgroupScope &&
|
||||
coopmat2_features.cooperativeMatrixFlexibleDimensions &&
|
||||
coopmat2_features.cooperativeMatrixReductions &&
|
||||
coopmat2_features.cooperativeMatrixConversions &&
|
||||
coopmat2_features.cooperativeMatrixPerElementOperations &&
|
||||
coopmat2_features.cooperativeMatrixTensorAddressing &&
|
||||
coopmat2_features.cooperativeMatrixBlockLoads;
|
||||
#else
|
||||
coopmat2_support = false;
|
||||
#endif
|
||||
|
||||
coopmat2_decode_vector_support = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector;
|
||||
#if !defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
|
||||
coopmat2_decode_vector_support = false;
|
||||
@@ -6376,9 +6519,12 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
: coopmat_support ? "KHR_coopmat"
|
||||
: "none";
|
||||
|
||||
bool dot2_f16 = dot2_f16_support && dot2_features.shaderMixedFloatDotProductFloat16AccFloat32;
|
||||
const char *fp16_str = fp16 ? (dot2_f16 ? "dot2" : "1") : "0";
|
||||
|
||||
std::string device_name = props2.properties.deviceName.data();
|
||||
GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | bf16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n",
|
||||
idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, bf16, subgroup_size,
|
||||
GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %s | bf16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n",
|
||||
idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16_str, bf16, subgroup_size,
|
||||
props2.properties.limits.maxComputeSharedMemorySize, integer_dot_product, matrix_cores.c_str());
|
||||
|
||||
if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
|
||||
@@ -7499,8 +7645,12 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
|
||||
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
memcpy((uint8_t *)dst->ptr + offset + i * dpitch, (const uint8_t *) src + i * spitch, width);
|
||||
if (width == spitch && width == dpitch) {
|
||||
memcpy((uint8_t *)dst->ptr + offset, src, width * height);
|
||||
} else {
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
memcpy((uint8_t *)dst->ptr + offset + i * dpitch, (const uint8_t *) src + i * spitch, width);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
|
||||
@@ -7619,8 +7769,29 @@ static void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, si
|
||||
if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
|
||||
GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) src->ptr + offset + i * spitch, width);
|
||||
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(src->device->compute_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(src->device, subctx);
|
||||
subctx->s->buffer->buf.pipelineBarrier(
|
||||
vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
{},
|
||||
{ { vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferWrite,
|
||||
vk::AccessFlagBits::eHostRead } },
|
||||
{}, {});
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, src->device->fence);
|
||||
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX),
|
||||
"vk_buffer_read_2d uma waitForFences");
|
||||
src->device->device.resetFences({ src->device->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(src->device);
|
||||
|
||||
if (width == spitch && width == dpitch) {
|
||||
memcpy(dst, (const uint8_t *) src->ptr + offset, width * height);
|
||||
} else {
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) src->ptr + offset + i * spitch, width);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
|
||||
@@ -8049,7 +8220,6 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
|
||||
static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out) {
|
||||
VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
|
||||
std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
|
||||
const int tensor_type_size = ggml_type_size(tensor->type);
|
||||
|
||||
const uint32_t ne = ggml_nelements(tensor);
|
||||
std::array<uint32_t, 3> elements;
|
||||
@@ -8062,14 +8232,41 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
|
||||
elements = { ne, 1, 1 };
|
||||
}
|
||||
|
||||
vk_op_unary_push_constants pc = {
|
||||
(uint32_t)ne,
|
||||
(uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size,
|
||||
(uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1 , (uint32_t)tensor->ne[0] , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]),
|
||||
0,
|
||||
0.0f, 0.0f,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne);
|
||||
pc.nb10 = 1;
|
||||
pc.nb11 = (uint32_t)tensor->ne[0];
|
||||
pc.nb12 = (uint32_t)(tensor->ne[0] * tensor->ne[1]);
|
||||
pc.nb13 = (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]);
|
||||
init_pushconst_fastdiv(pc);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
|
||||
// Copy/convert tensor into a caller-defined dense layout. Destination strides
|
||||
// are in output elements, not bytes.
|
||||
static void ggml_vk_cpy_to_strided(
|
||||
ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor,
|
||||
const vk_subbuffer & in, const vk_subbuffer & out,
|
||||
uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13) {
|
||||
VK_LOG_DEBUG("ggml_vk_cpy_to_strided((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
|
||||
std::cerr << "dst_nb=(" << nb10 << ", " << nb11 << ", " << nb12 << ", " << nb13 << "), buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
|
||||
|
||||
const uint32_t ne = ggml_nelements(tensor);
|
||||
std::array<uint32_t, 3> elements;
|
||||
|
||||
if (ne > 262144) {
|
||||
elements = { 512, 512, CEIL_DIV(ne, 262144) };
|
||||
} else if (ne > 512) {
|
||||
elements = { 512, CEIL_DIV(ne, 512), 1 };
|
||||
} else {
|
||||
elements = { ne, 1, 1 };
|
||||
}
|
||||
|
||||
vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne);
|
||||
pc.nb10 = nb10;
|
||||
pc.nb11 = nb11;
|
||||
pc.nb12 = nb12;
|
||||
pc.nb13 = nb13;
|
||||
init_pushconst_fastdiv(pc);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
@@ -8332,24 +8529,28 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
}
|
||||
if (y_non_contig) {
|
||||
if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
|
||||
ctx->prealloc_y_last_tensor_used != src1) {
|
||||
ctx->prealloc_y_last_tensor_used != src1 ||
|
||||
ctx->prealloc_y_last_decode_vector_staging) {
|
||||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
|
||||
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
ctx->prealloc_y_last_decode_vector_staging = false;
|
||||
}
|
||||
}
|
||||
if (quantize_y) {
|
||||
if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
|
||||
ctx->prealloc_y_last_tensor_used != src1) {
|
||||
ctx->prealloc_y_last_tensor_used != src1 ||
|
||||
ctx->prealloc_y_last_decode_vector_staging) {
|
||||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne);
|
||||
ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
ctx->prealloc_y_last_decode_vector_staging = false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8607,24 +8808,28 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
if (y_non_contig) {
|
||||
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
|
||||
if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
|
||||
ctx->prealloc_y_last_tensor_used != src1) {
|
||||
ctx->prealloc_y_last_tensor_used != src1 ||
|
||||
ctx->prealloc_y_last_decode_vector_staging) {
|
||||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, d_Qy, d_Y);
|
||||
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
ctx->prealloc_y_last_decode_vector_staging = false;
|
||||
}
|
||||
}
|
||||
if (quantize_y) {
|
||||
if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
|
||||
ctx->prealloc_y_last_tensor_used != src1) {
|
||||
ctx->prealloc_y_last_tensor_used != src1 ||
|
||||
ctx->prealloc_y_last_decode_vector_staging) {
|
||||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_quantize_q8_1(ctx, subctx, d_Qy, d_Y, y_ne);
|
||||
ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
ctx->prealloc_y_last_decode_vector_staging = false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9075,12 +9280,30 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
// Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf
|
||||
const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) ||
|
||||
!ggml_vk_dim01_contiguous(src0);
|
||||
const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) ||
|
||||
// If src0 is BF16, try to use a BF16 x BF16 multiply
|
||||
ggml_type f16_type = src0->type == GGML_TYPE_BF16 ? GGML_TYPE_BF16 : GGML_TYPE_F16;
|
||||
#if defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
|
||||
// B must already be, or be convertible to, the matmul B type used by this path.
|
||||
const bool y_decode_vector_supported = ctx->device->coopmat2_decode_vector &&
|
||||
(f16_type != GGML_TYPE_BF16 || ctx->device->coopmat2_bf16_support) &&
|
||||
(src1->type == GGML_TYPE_F32 || src1->type == f16_type);
|
||||
// If B is copied to prealloc_y, we can choose a 4-element-aligned row stride.
|
||||
const bool y_decode_vector_uses_prealloc = !ggml_vk_dim01_contiguous(src1) || src1->type != f16_type;
|
||||
// Direct B reads are safe only if row starts and the original buffer offset are 4-element aligned.
|
||||
const bool y_decode_vector_aligned =
|
||||
(ne10 % 4 == 0) &&
|
||||
(y_decode_vector_uses_prealloc || get_misalign_bytes(ctx, src1) % (4 * ggml_type_size(src1->type)) == 0);
|
||||
// Stage B only when decode-vector is available and direct B reads would be misaligned.
|
||||
const bool y_decode_vector_staging = y_decode_vector_supported && !y_decode_vector_aligned;
|
||||
#else
|
||||
const bool y_decode_vector_staging = false;
|
||||
#endif
|
||||
const bool y_non_contig = y_decode_vector_staging ||
|
||||
(ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) ||
|
||||
(src0->type == GGML_TYPE_BF16 && src1->type != GGML_TYPE_BF16) ||
|
||||
!ggml_vk_dim01_contiguous(src1);
|
||||
|
||||
// If src0 is BF16, try to use a BF16 x BF16 multiply
|
||||
ggml_type f16_type = src0->type == GGML_TYPE_BF16 ? GGML_TYPE_BF16 : GGML_TYPE_F16;
|
||||
const uint32_t y_staged_row_stride = y_decode_vector_staging ? (uint32_t)ggml_vk_align_size(ne10, 4) : (uint32_t)ne10;
|
||||
|
||||
const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
|
||||
|
||||
@@ -9119,11 +9342,11 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
// Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking
|
||||
uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11;
|
||||
const uint64_t x_ne = ggml_nelements(src0);
|
||||
const uint64_t y_ne = padded_n * ne10 * ne12 * ne13;
|
||||
const uint64_t y_ne = (uint64_t)y_staged_row_stride * padded_n * ne12 * ne13;
|
||||
const uint64_t d_ne = ggml_nelements(dst);
|
||||
|
||||
const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
|
||||
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
|
||||
const uint64_t qy_sz = ggml_type_size(src1->type) * ggml_nelements(src1) / ggml_blck_size(src1->type);
|
||||
const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
|
||||
const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne);
|
||||
const uint64_t ids_sz = nbi2;
|
||||
@@ -9133,13 +9356,30 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
vk_pipeline to_fp16_vk_1 = nullptr;
|
||||
vk_pipeline to_q8_1 = nullptr;
|
||||
|
||||
auto make_y_staged_dst = [&]() {
|
||||
ggml_tensor y_staged_dst = *src1;
|
||||
y_staged_dst.type = f16_type;
|
||||
y_staged_dst.nb[0] = ggml_type_size(f16_type);
|
||||
y_staged_dst.nb[1] = y_staged_dst.nb[0] * y_staged_row_stride;
|
||||
y_staged_dst.nb[2] = y_staged_dst.nb[1] * padded_n;
|
||||
y_staged_dst.nb[3] = y_staged_dst.nb[2] * y_staged_dst.ne[2];
|
||||
return y_staged_dst;
|
||||
};
|
||||
|
||||
if (x_non_contig) {
|
||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, f16_type);
|
||||
} else {
|
||||
to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
|
||||
}
|
||||
if (y_non_contig) {
|
||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, f16_type);
|
||||
ggml_tensor y_staged_dst;
|
||||
const ggml_tensor * y_staged_dst_ptr = nullptr;
|
||||
if (y_decode_vector_staging) {
|
||||
y_staged_dst = make_y_staged_dst();
|
||||
y_staged_dst_ptr = &y_staged_dst;
|
||||
}
|
||||
|
||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, y_staged_dst_ptr, f16_type);
|
||||
} else {
|
||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||
}
|
||||
@@ -9257,30 +9497,47 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
}
|
||||
if (y_non_contig) {
|
||||
if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
|
||||
ctx->prealloc_y_last_tensor_used != src1) {
|
||||
ctx->prealloc_y_last_tensor_used != src1 ||
|
||||
ctx->prealloc_y_last_decode_vector_staging != y_decode_vector_staging) {
|
||||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
|
||||
if (y_decode_vector_staging) {
|
||||
const ggml_tensor y_staged_dst = make_y_staged_dst();
|
||||
const uint32_t y_staged_dst_type_size = ggml_type_size(y_staged_dst.type);
|
||||
ggml_vk_cpy_to_strided(
|
||||
ctx, subctx, to_fp16_vk_1, src1,
|
||||
ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0),
|
||||
(uint32_t)(y_staged_dst.nb[0] / y_staged_dst_type_size),
|
||||
(uint32_t)(y_staged_dst.nb[1] / y_staged_dst_type_size),
|
||||
(uint32_t)(y_staged_dst.nb[2] / y_staged_dst_type_size),
|
||||
(uint32_t)(y_staged_dst.nb[3] / y_staged_dst_type_size));
|
||||
} else {
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
|
||||
}
|
||||
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
ctx->prealloc_y_last_decode_vector_staging = y_decode_vector_staging;
|
||||
}
|
||||
}
|
||||
if (quantize_y) {
|
||||
if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
|
||||
ctx->prealloc_y_last_tensor_used != src1) {
|
||||
ctx->prealloc_y_last_tensor_used != src1 ||
|
||||
ctx->prealloc_y_last_decode_vector_staging) {
|
||||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne);
|
||||
ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
ctx->prealloc_y_last_decode_vector_staging = false;
|
||||
}
|
||||
}
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
|
||||
uint32_t stride_batch_x = ne00*ne01;
|
||||
uint32_t stride_batch_y = ne10*ne11;
|
||||
uint32_t stride_b_y = y_decode_vector_staging ? y_staged_row_stride : ne10;
|
||||
uint32_t stride_batch_y = y_decode_vector_staging ? y_staged_row_stride * padded_n : ne10*ne11;
|
||||
|
||||
if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
|
||||
stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
|
||||
@@ -9295,7 +9552,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
ctx, subctx, pipeline,
|
||||
{ d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz },
|
||||
{ d_D, d_buf_offset, d_sz }, { d_ids, ids_buf_offset, ids_sz }, expert_count_buf,
|
||||
ne01, ne21, ne10, ne10, ne10, ne01,
|
||||
ne01, ne21, ne10, ne10, stride_b_y, ne01,
|
||||
stride_batch_x, stride_batch_y, ne20*ne21,
|
||||
n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11, padded_n
|
||||
); // NOLINT
|
||||
@@ -9453,24 +9710,28 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
||||
if (y_non_contig) {
|
||||
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
|
||||
if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
|
||||
ctx->prealloc_y_last_tensor_used != src1) {
|
||||
ctx->prealloc_y_last_tensor_used != src1 ||
|
||||
ctx->prealloc_y_last_decode_vector_staging) {
|
||||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, d_Qy, d_Y);
|
||||
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
ctx->prealloc_y_last_decode_vector_staging = false;
|
||||
}
|
||||
}
|
||||
if (quantize_y) {
|
||||
if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
|
||||
ctx->prealloc_y_last_tensor_used != src1) {
|
||||
ctx->prealloc_y_last_tensor_used != src1 ||
|
||||
ctx->prealloc_y_last_decode_vector_staging) {
|
||||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_quantize_q8_1(ctx, subctx, d_Qy, d_Y, y_ne);
|
||||
ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
ctx->prealloc_y_last_decode_vector_staging = false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10210,6 +10471,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
switch (ggml_get_unary_op(dst)) {
|
||||
case GGML_UNARY_OP_EXP:
|
||||
return ctx->device->pipeline_exp[dst->type == GGML_TYPE_F16];
|
||||
case GGML_UNARY_OP_EXPM1:
|
||||
return ctx->device->pipeline_expm1[dst->type == GGML_TYPE_F16];
|
||||
case GGML_UNARY_OP_ELU:
|
||||
return ctx->device->pipeline_elu[dst->type == GGML_TYPE_F16];
|
||||
case GGML_UNARY_OP_SILU:
|
||||
@@ -10608,6 +10871,21 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_glu_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t b_offset = src1 ? get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type) : a_offset;
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
GGML_ASSERT(a_offset < (1u << 8));
|
||||
GGML_ASSERT(b_offset < (1u << 8));
|
||||
GGML_ASSERT(d_offset < (1u << 8));
|
||||
|
||||
p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
|
||||
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
@@ -11325,7 +11603,6 @@ static void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& s
|
||||
const ggml_tensor * src_q = dst->src[0];
|
||||
const ggml_tensor * src_v = dst->src[2];
|
||||
const ggml_tensor * src_beta = dst->src[4];
|
||||
const ggml_tensor * src_state = dst->src[5];
|
||||
|
||||
GGML_ASSERT(dst->buffer != nullptr);
|
||||
|
||||
@@ -11334,8 +11611,8 @@ static void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& s
|
||||
const uint32_t n_tokens = (uint32_t)src_v->ne[2];
|
||||
const uint32_t n_seqs = (uint32_t)src_v->ne[3];
|
||||
|
||||
// state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
|
||||
const uint32_t K = (uint32_t)src_state->ne[1];
|
||||
// K (snapshot slot count) is an op param; state holds s0 only [S_v, S_v, H, n_seqs].
|
||||
const uint32_t K = (uint32_t)ggml_get_op_params_i32(dst, 0);
|
||||
|
||||
const uint32_t s_off = S_v * H * n_tokens * n_seqs;
|
||||
|
||||
@@ -11958,17 +12235,17 @@ static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
||||
}
|
||||
|
||||
static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f });
|
||||
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst));
|
||||
}
|
||||
|
||||
static void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
float * op_params = (float *)dst->op_params;
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY,
|
||||
{
|
||||
(uint32_t)ggml_nelements(src0), 0,
|
||||
op_params[1], op_params[2], op_params[3], op_params[4]
|
||||
}
|
||||
);
|
||||
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
||||
p.param1 = op_params[1];
|
||||
p.param2 = op_params[2];
|
||||
p.param3 = op_params[3];
|
||||
p.param4 = op_params[4];
|
||||
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, std::move(p));
|
||||
}
|
||||
|
||||
static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
@@ -11988,6 +12265,9 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||
}
|
||||
|
||||
const uint32_t mode = split ? 2 : (swapped ? 1 : 0);
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t src1_type_size = split ? ggml_type_size(src1->type) : src0_type_size;
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_glu_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GLU,
|
||||
{
|
||||
@@ -11997,16 +12277,22 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||
mode,
|
||||
alpha,
|
||||
limit,
|
||||
(uint32_t)(src0->nb[1] / src0->nb[0]),
|
||||
(uint32_t)(src0->nb[2] / src0->nb[0]),
|
||||
(uint32_t)(src0->nb[3] / src0->nb[0]),
|
||||
(uint32_t)src0->ne[1],
|
||||
(uint32_t)src0->ne[2],
|
||||
(uint32_t)(dst->nb[1] / dst->nb[0]),
|
||||
(uint32_t)(dst->nb[2] / dst->nb[0]),
|
||||
(uint32_t)(dst->nb[3] / dst->nb[0]),
|
||||
(uint32_t)(src0->nb[0] / src0_type_size),
|
||||
(uint32_t)(src0->nb[1] / src0_type_size),
|
||||
(uint32_t)(src0->nb[2] / src0_type_size),
|
||||
(uint32_t)(src0->nb[3] / src0_type_size),
|
||||
(uint32_t)((split ? src1->nb[0] : src0->nb[0]) / src1_type_size),
|
||||
(uint32_t)((split ? src1->nb[1] : src0->nb[1]) / src1_type_size),
|
||||
(uint32_t)((split ? src1->nb[2] : src0->nb[2]) / src1_type_size),
|
||||
(uint32_t)((split ? src1->nb[3] : src0->nb[3]) / src1_type_size),
|
||||
(uint32_t)(dst->nb[0] / dst_type_size),
|
||||
(uint32_t)(dst->nb[1] / dst_type_size),
|
||||
(uint32_t)(dst->nb[2] / dst_type_size),
|
||||
(uint32_t)(dst->nb[3] / dst_type_size),
|
||||
(uint32_t)dst->ne[1],
|
||||
(uint32_t)dst->ne[2]
|
||||
(uint32_t)dst->ne[2],
|
||||
0,
|
||||
0, 0, 0, 0, 0, 0,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -13695,7 +13981,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
|
||||
ggml_vk_destroy_buffer(ctx->prealloc_y);
|
||||
}
|
||||
ctx->prealloc_y = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_y);
|
||||
ctx->prealloc_y_last_pipeline_used = nullptr;
|
||||
ctx->prealloc_y_last_tensor_used = nullptr;
|
||||
ctx->prealloc_y_last_decode_vector_staging = false;
|
||||
}
|
||||
if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
|
||||
VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")");
|
||||
@@ -14007,6 +14295,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
switch (ggml_get_unary_op(node)) {
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_EXPM1:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
@@ -14275,6 +14564,8 @@ static void ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
|
||||
VK_LOG_DEBUG("ggml_vk_graph_cleanup()");
|
||||
ctx->prealloc_y_last_pipeline_used = {};
|
||||
ctx->prealloc_y_last_tensor_used = nullptr;
|
||||
ctx->prealloc_y_last_decode_vector_staging = false;
|
||||
|
||||
ctx->unsynced_nodes_written.clear();
|
||||
ctx->unsynced_nodes_read.clear();
|
||||
@@ -14325,6 +14616,8 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
||||
ggml_vk_destroy_buffer(ctx->sync_staging);
|
||||
|
||||
ctx->prealloc_y_last_pipeline_used = nullptr;
|
||||
ctx->prealloc_y_last_tensor_used = nullptr;
|
||||
ctx->prealloc_y_last_decode_vector_staging = false;
|
||||
|
||||
ctx->prealloc_size_x = 0;
|
||||
ctx->prealloc_size_y = 0;
|
||||
@@ -15504,6 +15797,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
|
||||
ctx->prealloc_y_last_pipeline_used = nullptr;
|
||||
ctx->prealloc_y_last_tensor_used = nullptr;
|
||||
ctx->prealloc_y_last_decode_vector_staging = false;
|
||||
|
||||
if (ctx->prealloc_size_add_rms_partials) {
|
||||
ggml_vk_preallocate_buffers(ctx, nullptr);
|
||||
@@ -16391,6 +16685,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_EXPM1:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
@@ -16411,8 +16706,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_UNARY_OP_FLOOR:
|
||||
case GGML_UNARY_OP_TRUNC:
|
||||
case GGML_UNARY_OP_SGN:
|
||||
return ggml_is_contiguous(op->src[0]) &&
|
||||
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
|
||||
(op->src[0]->type == op->type);
|
||||
default:
|
||||
@@ -16428,7 +16722,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
|
||||
(op->src[0]->type == op->type);
|
||||
(op->src[0]->type == op->type) &&
|
||||
(!op->src[1] || op->src[1]->type == op->src[0]->type);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -17558,6 +17853,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
case GGML_UNARY_OP_EXP:
|
||||
tensor_clone = ggml_exp(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_EXPM1:
|
||||
tensor_clone = ggml_expm1(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_ELU:
|
||||
tensor_clone = ggml_elu(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
@@ -17744,7 +18042,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
src_clone[4], src_clone[5], src_clone[6]);
|
||||
} else if (tensor->op == GGML_OP_GATED_DELTA_NET) {
|
||||
tensor_clone = ggml_gated_delta_net(ggml_ctx, src_clone[0], src_clone[1],
|
||||
src_clone[2], src_clone[3], src_clone[4], src_clone[5]);
|
||||
src_clone[2], src_clone[3], src_clone[4], src_clone[5],
|
||||
ggml_get_op_params_i32(tensor, 0));
|
||||
} else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) {
|
||||
src_clone[0]->flags = tensor->src[0]->flags;
|
||||
tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1],
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
data_d[i] = D_TYPE(abs(float(data_a[i])));
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float x = float(data_a[i]);
|
||||
data_d[i] = D_TYPE(ceil(x));
|
||||
}
|
||||
@@ -12,11 +12,11 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
|
||||
const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
|
||||
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
|
||||
const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
|
||||
const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
|
||||
const uint i12_offset = i12*p.ne11*p.ne10;
|
||||
const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
|
||||
const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
|
||||
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
|
||||
|
||||
if (i10 == i11) {
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
#ifdef DOT2_F16
|
||||
#extension GL_EXT_spirv_intrinsics : require
|
||||
|
||||
spirv_instruction(extensions = ["SPV_VALVE_mixed_float_dot_product"],
|
||||
capabilities = [6912], id = 6916)
|
||||
float v_dot2_f32_f16(f16vec2 a, f16vec2 b, float acc);
|
||||
|
||||
ACC_TYPE dot_product(f16vec4 a, f16vec4 b, ACC_TYPE acc) {
|
||||
return ACC_TYPE(v_dot2_f32_f16(a.zw, b.zw, v_dot2_f32_f16(a.xy, b.xy, float(acc))));
|
||||
}
|
||||
|
||||
ACC_TYPE dot_product(f16vec2 a, f16vec2 b, ACC_TYPE acc) {
|
||||
return ACC_TYPE(v_dot2_f32_f16(a, b, float(acc)));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
ACC_TYPE dot_product(FLOAT_TYPEV4 a, FLOAT_TYPEV4 b, ACC_TYPE acc) {
|
||||
return fma(ACC_TYPE(a.x), ACC_TYPE(b.x), fma(ACC_TYPE(a.y), ACC_TYPE(b.y),
|
||||
fma(ACC_TYPE(a.z), ACC_TYPE(b.z), fma(ACC_TYPE(a.w), ACC_TYPE(b.w), acc))));
|
||||
}
|
||||
|
||||
ACC_TYPE dot_product(FLOAT_TYPEV2 a, FLOAT_TYPEV2 b, ACC_TYPE acc) {
|
||||
return fma(ACC_TYPE(a.x), ACC_TYPE(b.x), fma(ACC_TYPE(a.y), ACC_TYPE(b.y), acc));
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,27 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
float x = float(data_a[i]);
|
||||
|
||||
if (x < 0.0f) {
|
||||
x = exp(x) - 1;
|
||||
}
|
||||
|
||||
data_d[i] = D_TYPE(x);
|
||||
}
|
||||
@@ -1,20 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
data_d[i] = D_TYPE(exp(float(data_a[i])));
|
||||
}
|
||||
@@ -21,6 +21,7 @@
|
||||
#extension GL_KHR_shader_subgroup_vote : enable
|
||||
|
||||
#include "types.glsl"
|
||||
#include "dot_product_funcs.glsl"
|
||||
#include "flash_attn_base.glsl"
|
||||
#include "flash_attn_dequant.glsl"
|
||||
|
||||
@@ -318,7 +319,7 @@ void main() {
|
||||
K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * k_stride / 4 + d * D_split + d_tid]);
|
||||
}
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
Sf[r][c] += dot(ACC_TYPEV4(Q_cache[r]), ACC_TYPEV4(K_Tf));
|
||||
Sf[r][c] = dot_product(Q_cache[r], K_Tf, Sf[r][c]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -341,7 +342,7 @@ void main() {
|
||||
K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * k_stride / 4 + d * D_split + d_tid]);
|
||||
}
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
Sf[r][c] += dot(ACC_TYPEV4(Qf[tile_row(r) * qf_stride + d * D_split + d_tid]), ACC_TYPEV4(K_Tf));
|
||||
Sf[r][c] = dot_product(Qf[tile_row(r) * qf_stride + d * D_split + d_tid], K_Tf, Sf[r][c]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float x = float(data_a[i]);
|
||||
data_d[i] = D_TYPE(floor(x));
|
||||
}
|
||||
@@ -1,14 +1,16 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
#ifndef FWHT_SHMEM
|
||||
#extension GL_KHR_shader_subgroup_basic : enable
|
||||
#extension GL_KHR_shader_subgroup_shuffle : enable
|
||||
#endif
|
||||
|
||||
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
layout(constant_id = 1) const uint N = 128;
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
|
||||
|
||||
layout(constant_id = 0) const uint WARP_SIZE = 32;
|
||||
layout(constant_id = 1) const uint N = 128;
|
||||
|
||||
layout(push_constant) uniform parameter
|
||||
{
|
||||
uint n_rows;
|
||||
@@ -20,35 +22,72 @@ layout(push_constant) uniform parameter
|
||||
layout(binding = 0, std430) readonly buffer A { float data_a[]; };
|
||||
layout(binding = 1, std430) writeonly buffer D { float data_d[]; };
|
||||
|
||||
const uint EL_W = N / WARP_SIZE;
|
||||
const uint EL_W = N / BLOCK_SIZE;
|
||||
|
||||
#ifdef FWHT_SHMEM
|
||||
shared float shmem[4 * N];
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
const uint lane = gl_SubgroupInvocationID;
|
||||
for (uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_SubgroupID;
|
||||
row < n_rows;
|
||||
row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) {
|
||||
#ifdef FWHT_SHMEM
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint shmem_base = gl_LocalInvocationID.y * N;
|
||||
const uint row_id = gl_LocalInvocationID.y;
|
||||
#else
|
||||
const uint tid = gl_SubgroupInvocationID;
|
||||
const uint row_id = gl_SubgroupID;
|
||||
#endif
|
||||
|
||||
for (uint base_row = gl_WorkGroupID.x * gl_WorkGroupSize.y;
|
||||
base_row < n_rows;
|
||||
base_row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) {
|
||||
const uint row = base_row + row_id;
|
||||
const uint row_offset = row * N;
|
||||
|
||||
#ifndef FWHT_SHMEM
|
||||
if (row >= n_rows) {
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
float reg[EL_W];
|
||||
|
||||
[[unroll]]
|
||||
for (uint i = 0; i < EL_W; ++i) {
|
||||
reg[i] = data_a[src_offset + row_offset + i * WARP_SIZE + lane] * scale;
|
||||
reg[i] = row < n_rows ? data_a[src_offset + row_offset + i * BLOCK_SIZE + tid] * scale : 0.0;
|
||||
}
|
||||
|
||||
#ifdef FWHT_SHMEM
|
||||
[[unroll]]
|
||||
for (uint h = 1; h < WARP_SIZE; h <<= 1) {
|
||||
for (uint h = 1; h < BLOCK_SIZE; h <<= 1) {
|
||||
[[unroll]]
|
||||
for (uint i = 0; i < EL_W; ++i) {
|
||||
shmem[shmem_base + i * BLOCK_SIZE + tid] = reg[i];
|
||||
}
|
||||
barrier();
|
||||
[[unroll]]
|
||||
for (uint j = 0; j < EL_W; ++j) {
|
||||
const float val = reg[j];
|
||||
const float other = shmem[shmem_base + j * BLOCK_SIZE + (tid ^ h)];
|
||||
reg[j] = (tid & h) == 0 ? val + other : other - val;
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
#else
|
||||
[[unroll]]
|
||||
for (uint h = 1; h < BLOCK_SIZE; h <<= 1) {
|
||||
[[unroll]]
|
||||
for (uint j = 0; j < EL_W; ++j) {
|
||||
const float val = reg[j];
|
||||
const float val2 = subgroupShuffleXor(val, h);
|
||||
reg[j] = (lane & h) == 0 ? val + val2 : val2 - val;
|
||||
reg[j] = (tid & h) == 0 ? val + val2 : val2 - val;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
[[unroll]]
|
||||
for (uint h = WARP_SIZE; h < N; h <<= 1) {
|
||||
const uint step = h / WARP_SIZE;
|
||||
for (uint h = BLOCK_SIZE; h < N; h <<= 1) {
|
||||
const uint step = h / BLOCK_SIZE;
|
||||
[[unroll]]
|
||||
for (uint j = 0; j < EL_W; j += 2 * step) {
|
||||
[[unroll]]
|
||||
@@ -61,9 +100,16 @@ void main() {
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]]
|
||||
for (uint i = 0; i < EL_W; ++i) {
|
||||
data_d[dst_offset + row_offset + i * WARP_SIZE + lane] = reg[i];
|
||||
#ifdef FWHT_SHMEM
|
||||
if (row < n_rows) {
|
||||
#endif
|
||||
[[unroll]]
|
||||
for (uint i = 0; i < EL_W; ++i) {
|
||||
data_d[dst_offset + row_offset + i * BLOCK_SIZE + tid] = reg[i];
|
||||
}
|
||||
#ifdef FWHT_SHMEM
|
||||
}
|
||||
barrier();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,8 +102,8 @@ void main() {
|
||||
const uint iq3 = seq_id / rq3;
|
||||
|
||||
const uint state_size = S_V * S_V;
|
||||
// input state layout (D, K, n_seqs): per-seq stride is K*H*D; we read slot 0.
|
||||
const uint state_in_base = (seq_id * K * H + head_id) * state_size;
|
||||
// input state holds s0 only [S_v, S_v, H, n_seqs]: per-seq stride is H*D.
|
||||
const uint state_in_base = (seq_id * H + head_id) * state_size;
|
||||
// output state layout per slot: same per-(seq,head) offset as the single-slot case.
|
||||
const uint state_out_base = (seq_id * H + head_id) * state_size;
|
||||
const uint state_size_per_snap = state_size * H * n_seqs;
|
||||
@@ -113,9 +113,8 @@ void main() {
|
||||
s_shard[r] = FLOAT_TYPE(data_state[state_in_base + col * S_V + r * LANES_PER_COLUMN + lane]);
|
||||
}
|
||||
|
||||
// snapshot slot mapping: target_slot = t - shift. When n_tokens < K, only the last
|
||||
// n_tokens slots are written; earlier slots are left untouched (caller-owned).
|
||||
const int shift = int(n_tokens) - int(K);
|
||||
// snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
|
||||
// When n_tokens < K, only slots 0..n_tokens-1 are written; older slots are caller-owned.
|
||||
|
||||
uint attn_off = (seq_id * n_tokens * H + head_id) * S_V;
|
||||
|
||||
@@ -172,7 +171,7 @@ void main() {
|
||||
attn_off += S_V * H;
|
||||
|
||||
if (K > 1u) {
|
||||
const int target_slot = int(t) - shift;
|
||||
const int target_slot = int(n_tokens) - 1 - int(t);
|
||||
if (target_slot >= 0 && target_slot < int(K)) {
|
||||
const uint slot_base = s_off + uint(target_slot) * state_size_per_snap + state_out_base;
|
||||
[[unroll]] for (uint r = 0; r < ROWS_PER_LANE; r++) {
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const float GELU_COEF_A = 0.044715f;
|
||||
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float xi = float(data_a[i]);
|
||||
const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
|
||||
data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1)));
|
||||
}
|
||||
@@ -1,39 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
|
||||
// ref: https://www.johndcook.com/blog/python_erf/
|
||||
const float p_erf = 0.3275911f;
|
||||
const float a1_erf = 0.254829592f;
|
||||
const float a2_erf = -0.284496736f;
|
||||
const float a3_erf = 1.421413741f;
|
||||
const float a4_erf = -1.453152027f;
|
||||
const float a5_erf = 1.061405429f;
|
||||
|
||||
const float SQRT_2_INV = 0.70710678118654752440084436210484f;
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float a = float(data_a[i]);
|
||||
const float a_div_sqr2 = a * SQRT_2_INV;
|
||||
const float sign_x = sign(a_div_sqr2);
|
||||
const float x = abs(a_div_sqr2);
|
||||
const float t = 1.0f / (1.0f + p_erf * x);
|
||||
const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
|
||||
const float erf_approx = sign_x * y;
|
||||
|
||||
data_d[i] = D_TYPE(0.5f * a * (1.0f + erf_approx));
|
||||
}
|
||||
@@ -1,23 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const float GELU_QUICK_COEF = -1.702f;
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float x = float(data_a[i]);
|
||||
data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
|
||||
}
|
||||
@@ -7,14 +7,12 @@ layout (push_constant) uniform parameter
|
||||
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
|
||||
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
||||
uint misalign_offsets;
|
||||
float param1; float param2;
|
||||
float param1; float param2; float param3; float param4;
|
||||
|
||||
uint ne0_012mp; uint ne0_012L;
|
||||
uint ne0_01mp; uint ne0_01L;
|
||||
uint ne0_0mp; uint ne0_0L;
|
||||
uint ne1_012mp; uint ne1_012L;
|
||||
uint ne1_01mp; uint ne1_01L;
|
||||
uint ne1_0mp; uint ne1_0L;
|
||||
// The three L values are packed as bytes to keep this layout under the 128B
|
||||
// push constant limit while still leaving room for four float parameters.
|
||||
uint ne0_012mp; uint ne0_01mp; uint ne0_0mp; uint ne0_Ls;
|
||||
uint ne1_012mp; uint ne1_01mp; uint ne1_0mp; uint ne1_Ls;
|
||||
} p;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
@@ -42,42 +40,46 @@ uint fastdiv(uint n, uint mp, uint L) {
|
||||
return (msbs + n) >> L;
|
||||
}
|
||||
|
||||
uint fastdiv_L(uint packed, uint slot) {
|
||||
return (packed >> (slot * 8)) & 0x3Fu;
|
||||
}
|
||||
|
||||
uint src0_idx(uint idx) {
|
||||
const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
|
||||
const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0));
|
||||
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
|
||||
const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
|
||||
const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1));
|
||||
const uint i02_offset = i02*p.ne01*p.ne00;
|
||||
const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
|
||||
const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2));
|
||||
const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
|
||||
return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
|
||||
}
|
||||
|
||||
uint dst_idx(uint idx) {
|
||||
const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
|
||||
const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
|
||||
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
|
||||
const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
|
||||
const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
|
||||
const uint i12_offset = i12*p.ne11*p.ne10;
|
||||
const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
|
||||
const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
|
||||
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
|
||||
return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
|
||||
}
|
||||
|
||||
uint src0_idx_quant(uint idx, uint qk) {
|
||||
const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
|
||||
const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0));
|
||||
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
|
||||
const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
|
||||
const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1));
|
||||
const uint i02_offset = i02*p.ne01*p.ne00;
|
||||
const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
|
||||
const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2));
|
||||
const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
|
||||
return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + (i00/qk)*p.nb00;
|
||||
}
|
||||
|
||||
uint dst_idx_quant(uint idx, uint qk) {
|
||||
const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
|
||||
const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
|
||||
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
|
||||
const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
|
||||
const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
|
||||
const uint i12_offset = i12*p.ne11*p.ne10;
|
||||
const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
|
||||
const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
|
||||
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
|
||||
return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + (i10/qk)*p.nb10;
|
||||
}
|
||||
|
||||
@@ -15,14 +15,33 @@ layout (push_constant) uniform parameter
|
||||
uint mode;
|
||||
float alpha;
|
||||
float limit;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
uint ne01;
|
||||
uint ne02;
|
||||
uint nb10;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
uint ne11;
|
||||
uint ne12;
|
||||
uint nb20;
|
||||
uint nb21;
|
||||
uint nb22;
|
||||
uint nb23;
|
||||
uint ne21;
|
||||
uint ne22;
|
||||
uint misalign_offsets;
|
||||
uint ne2_012mp; uint ne2_012L;
|
||||
uint ne2_01mp; uint ne2_01L;
|
||||
uint ne2_0mp; uint ne2_0L;
|
||||
} p;
|
||||
|
||||
uint get_aoffset() { return p.misalign_offsets >> 16; }
|
||||
uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; }
|
||||
uint get_doffset() { return p.misalign_offsets & 0xFF; }
|
||||
|
||||
// see init_fastdiv_values in ggml-vulkan.cpp
|
||||
uint fastdiv(uint n, uint mp, uint L) {
|
||||
uint msbs, lsbs;
|
||||
umulExtended(n, mp, msbs, lsbs);
|
||||
return (msbs + n) >> L;
|
||||
}
|
||||
|
||||
@@ -5,35 +5,31 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint row = i / p.ne20;
|
||||
const uint col = i - row * p.ne20;
|
||||
const uint i23 = fastdiv(i, p.ne2_012mp, p.ne2_012L);
|
||||
const uint i23_offset = i23 * p.ne22*p.ne21*p.ne20;
|
||||
const uint i22 = fastdiv(i - i23_offset, p.ne2_01mp, p.ne2_01L);
|
||||
const uint i22_offset = i22*p.ne21*p.ne20;
|
||||
const uint i21 = fastdiv(i - i23_offset - i22_offset, p.ne2_0mp, p.ne2_0L);
|
||||
const uint i20 = i - i23_offset - i22_offset - i21*p.ne20;
|
||||
|
||||
const uint i3 = row / (p.ne01 * p.ne02);
|
||||
const uint i2 = (row % (p.ne01 * p.ne02)) / p.ne01;
|
||||
const uint i1 = row % p.ne01;
|
||||
const uint src_idx = i3 * p.nb03 + i2 * p.nb02 + i1 * p.nb01 + col;
|
||||
|
||||
const uint dst_i3 = row / (p.ne11 * p.ne12);
|
||||
const uint dst_i2 = (row % (p.ne11 * p.ne12)) / p.ne11;
|
||||
const uint dst_i1 = row % p.ne11;
|
||||
const uint dst_idx = dst_i3 * p.nb13 + dst_i2 * p.nb12 + dst_i1 * p.nb11 + col;
|
||||
const uint src_idx_a = get_aoffset() + i23 * p.nb03 + i22 * p.nb02 + i21 * p.nb01 + i20 * p.nb00;
|
||||
const uint src_idx_b = get_boffset() + i23 * p.nb13 + i22 * p.nb12 + i21 * p.nb11 + i20 * p.nb10;
|
||||
const uint dst_idx = get_doffset() + i23 * p.nb23 + i22 * p.nb22 + i21 * p.nb21 + i20 * p.nb20;
|
||||
|
||||
if (p.mode == 0) {
|
||||
// Default
|
||||
const uint offset = p.ne00 / 2;
|
||||
const uint idx = src_idx;
|
||||
const uint offset = (p.ne00 / 2) * p.nb00;
|
||||
const uint idx = src_idx_a;
|
||||
|
||||
data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset])));
|
||||
} else if (p.mode == 1) {
|
||||
// Swapped
|
||||
const uint offset = p.ne00 / 2;
|
||||
const uint idx = src_idx;
|
||||
const uint offset = (p.ne00 / 2) * p.nb00;
|
||||
const uint idx = src_idx_a;
|
||||
|
||||
data_d[dst_idx] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx])));
|
||||
} else {
|
||||
// Split
|
||||
const uint idx = src_idx;
|
||||
|
||||
data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx])));
|
||||
data_d[dst_idx] = D_TYPE(op(float(data_a[src_idx_a]), float(data_b[src_idx_b])));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float x = float(data_a[i]);
|
||||
data_d[i] = D_TYPE(min(1.0f, max(0.0f, (x + 3.0f) / 6.0f)));
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float x = float(data_a[i]);
|
||||
data_d[i] = D_TYPE(x * min(1.0f, max(0.0f, (x + 3.0f) / 6.0f)));
|
||||
}
|
||||
@@ -4,6 +4,7 @@
|
||||
#extension GL_EXT_integer_dot_product : require
|
||||
|
||||
#define MMQ
|
||||
#define NEEDS_IQ1S_GRID_GPU
|
||||
#define B_TYPE block_q8_1_x4
|
||||
|
||||
#include "mul_mat_vec_base.glsl"
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#endif
|
||||
|
||||
#include "types.glsl"
|
||||
#include "dot_product_funcs.glsl"
|
||||
|
||||
#ifndef LOAD_VEC_A
|
||||
#define LOAD_VEC_A 1
|
||||
@@ -329,15 +330,8 @@ void main() {
|
||||
[[unroll]] for (uint cr = 0; cr < TM / 2; cr++) {
|
||||
// [WNITER][TN][WMITER][TM / 2] -> [wsic][cc][wsir][cr]
|
||||
const uint sums_idx = (wsic * TN + cc) * WMITER * (TM / 2) + wsir * (TM / 2) + cr;
|
||||
#if defined(DATA_A_F32) || defined(DATA_A_F16)
|
||||
sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b.y),
|
||||
fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].z), ACC_TYPE(cache_b.z), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].w), ACC_TYPE(cache_b.w), sums[sums_idx].x))));
|
||||
sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b.y),
|
||||
fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].z), ACC_TYPE(cache_b.z), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].w), ACC_TYPE(cache_b.w), sums[sums_idx].y))));
|
||||
#else
|
||||
sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b.y), sums[sums_idx].x));
|
||||
sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b.y), sums[sums_idx].y));
|
||||
#endif
|
||||
sums[sums_idx].x = dot_product(cache_a[wsir * TM + 2 * cr ], cache_b, sums[sums_idx].x);
|
||||
sums[sums_idx].y = dot_product(cache_a[wsir * TM + 2 * cr + 1], cache_b, sums[sums_idx].y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,9 @@
|
||||
#extension GL_KHR_memory_scope_semantics : enable
|
||||
#extension GL_KHR_cooperative_matrix : enable
|
||||
#extension GL_NV_cooperative_matrix2 : enable
|
||||
#ifdef GGML_VULKAN_COOPMAT2_DECODE_VECTOR
|
||||
#extension GL_NV_cooperative_matrix_decode_vector : enable
|
||||
#endif
|
||||
#extension GL_EXT_buffer_reference : enable
|
||||
#extension GL_KHR_shader_subgroup_ballot : enable
|
||||
#extension GL_KHR_shader_subgroup_vote : enable
|
||||
@@ -69,10 +72,13 @@ layout (push_constant) uniform parameter
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
#if defined(MUL_MAT_ID) && defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR)
|
||||
layout (binding = 1) readonly buffer B4 {B_TYPEV4 data_b_v4[];};
|
||||
#endif
|
||||
|
||||
#if QUANT_K > 1
|
||||
#include "dequant_funcs_cm2.glsl"
|
||||
#if defined(dequantFuncA_v) && defined(GL_NV_cooperative_matrix_decode_vector)
|
||||
#if defined(dequantFuncA_v) && defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR)
|
||||
#define DECODEFUNCA , dequantFuncA, dequantFuncA_v
|
||||
#else
|
||||
#define DECODEFUNCA , dequantFuncA
|
||||
@@ -113,11 +119,33 @@ B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const i
|
||||
const uint row_i = blockCoords[0];
|
||||
|
||||
const u16vec4 row_idx = row_ids[row_i];
|
||||
B_TYPE ret = data_b[row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + blockCoords[1]];
|
||||
#if defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR)
|
||||
// The decode-vector path gives B a K-dimension tensor-layout block size of BK.
|
||||
const uint k = blockCoords[1] * BK + coordInBlock[1];
|
||||
#else
|
||||
const uint k = blockCoords[1];
|
||||
#endif
|
||||
B_TYPE ret = data_b[row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + k];
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR)
|
||||
B_TYPEV4 decodeFuncB_v(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const uint row_i = blockCoords[0];
|
||||
|
||||
const u16vec4 row_idx = row_ids[row_i];
|
||||
const uint k = blockCoords[1] * BK + coordInBlock[1];
|
||||
const uint base = row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + k;
|
||||
|
||||
return data_b_v4[base >> 2];
|
||||
}
|
||||
#define DECODEFUNCB , decodeFuncB, decodeFuncB_v
|
||||
#else
|
||||
#define DECODEFUNCB , decodeFuncB
|
||||
#endif
|
||||
|
||||
D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t ir, const in uint32_t ic)
|
||||
{
|
||||
uint dr = ir * BM + r;
|
||||
@@ -287,6 +315,9 @@ void main() {
|
||||
tensorLayoutA = setTensorLayoutBlockSizeNV(tensorLayoutA, 1, QUANT_K);
|
||||
tensorLayoutAClamp = setTensorLayoutBlockSizeNV(tensorLayoutAClamp, 1, QUANT_K);
|
||||
#endif
|
||||
#if defined(MUL_MAT_ID) && defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR)
|
||||
tensorLayoutB = setTensorLayoutBlockSizeNV(tensorLayoutB, 1, BK);
|
||||
#endif
|
||||
|
||||
// Use end_k rather than p.K as the dimension because that's what
|
||||
// we need to bound check against when using split_k.
|
||||
@@ -499,7 +530,7 @@ void main() {
|
||||
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BNover4, gl_MatrixUseB> mat_b;
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover4, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover4, block_k, BK), tensorViewTranspose DECODEFUNCB);
|
||||
|
||||
sum = coopMatMulAdd(mat_a, mat_b, sum);
|
||||
} else {
|
||||
@@ -507,7 +538,7 @@ void main() {
|
||||
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BNover4, gl_MatrixUseB> mat_b;
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover4, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover4, block_k, BK), tensorViewTranspose DECODEFUNCB);
|
||||
|
||||
sum = coopMatMulAdd(mat_a, mat_b, sum);
|
||||
}
|
||||
@@ -543,7 +574,7 @@ void main() {
|
||||
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BNover2, gl_MatrixUseB> mat_b;
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover2, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover2, block_k, BK), tensorViewTranspose DECODEFUNCB);
|
||||
|
||||
sum = coopMatMulAdd(mat_a, mat_b, sum);
|
||||
} else {
|
||||
@@ -551,7 +582,7 @@ void main() {
|
||||
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BNover2, gl_MatrixUseB> mat_b;
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover2, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BNover2, block_k, BK), tensorViewTranspose DECODEFUNCB);
|
||||
|
||||
sum = coopMatMulAdd(mat_a, mat_b, sum);
|
||||
}
|
||||
@@ -588,7 +619,7 @@ void main() {
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
#ifdef MUL_MAT_ID
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BN, block_k, BK), tensorViewTranspose DECODEFUNCB);
|
||||
#else
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
|
||||
#endif
|
||||
@@ -600,7 +631,7 @@ void main() {
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
#ifdef MUL_MAT_ID
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, 0, BN, block_k, BK), tensorViewTranspose DECODEFUNCB);
|
||||
#else
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
|
||||
#endif
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
data_d[i] = D_TYPE(-float(data_a[i]));
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user