mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-02 18:47:43 +02:00
Compare commits
51 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| d304f459d8 | |||
| 0320ac5264 | |||
| a7a98e0fff | |||
| 8f8f2274ee | |||
| c959b676be | |||
| cd08fc3ecc | |||
| cb5bb6cc05 | |||
| a91d035b90 | |||
| 745cbcf2fe | |||
| 1cbd80f8cf | |||
| 85286f3548 | |||
| d5fabe3682 | |||
| 8ff206097c | |||
| 77475530b8 | |||
| 3913f8730e | |||
| 76888d202e | |||
| f1fbffb5c0 | |||
| 51abc96bdc | |||
| 07808ebb07 | |||
| 6d758839ff | |||
| 3d4053f77f | |||
| dc381aa9a6 | |||
| 10d197409b | |||
| b907255f4b | |||
| 28c39da7c6 | |||
| 106220562a | |||
| a68f31edd7 | |||
| b8e09f08b9 | |||
| 6c019cb04e | |||
| 9dcd200d57 | |||
| 0fa154e350 | |||
| 261e6a20ff | |||
| a0e13dcbe5 | |||
| a14bd35014 | |||
| 918b26f197 | |||
| 9ecb884346 | |||
| d1c6f11f47 | |||
| 6380d6a3e7 | |||
| aa0c461efe | |||
| b9c9c9f789 | |||
| 50f4281a6f | |||
| 55758b00ca | |||
| f161463a54 | |||
| 84d7b2fca1 | |||
| 40be51152d | |||
| 4bf5549269 | |||
| f4e664f838 | |||
| f088b6a84f | |||
| 304ac5693d | |||
| 6c88ad8fa7 | |||
| 704d90c987 |
@@ -22,6 +22,13 @@ AllowShortIfStatementsOnASingleLine: Never
|
||||
AllowShortLambdasOnASingleLine: Inline
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
|
||||
AttributeMacros:
|
||||
- __host__
|
||||
- __device__
|
||||
- __global__
|
||||
- __forceinline__
|
||||
- __launch_bounds__
|
||||
BinPackArguments: true
|
||||
BinPackParameters: false # OnePerLine
|
||||
BitFieldColonSpacing: Both
|
||||
|
||||
+14
-9
@@ -4,7 +4,7 @@ ARG UBUNTU_VERSION=24.04
|
||||
ARG ROCM_VERSION=6.4
|
||||
ARG AMDGPU_VERSION=6.4
|
||||
|
||||
# Target the CUDA build image
|
||||
# Target the ROCm build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
### Build image
|
||||
@@ -15,16 +15,13 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
# This is mostly tied to rocBLAS supported archs.
|
||||
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
|
||||
# gfx906 is deprecated
|
||||
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
|
||||
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html
|
||||
|
||||
ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
|
||||
#ARG ROCM_DOCKER_ARCH=gfx1100
|
||||
ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
|
||||
#ARG ROCM_DOCKER_ARCH='gfx1151'
|
||||
|
||||
# Set nvcc architectured
|
||||
# Set ROCm architectures
|
||||
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
# Enable ROCm
|
||||
# ENV CC=/opt/rocm/llvm/bin/clang
|
||||
# ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
@@ -39,8 +36,16 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN git clone https://github.com/rocm/rocwmma --branch develop --depth 1
|
||||
|
||||
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
|
||||
cmake -S . -B build \
|
||||
-DGGML_HIP=ON \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
-DCMAKE_HIP_FLAGS="-I$(pwd)/rocwmma/library/include/" \
|
||||
-DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \
|
||||
-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
|
||||
&& cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib \
|
||||
|
||||
@@ -52,3 +52,11 @@ insert_final_newline = unset
|
||||
[vendor/miniaudio/miniaudio.h]
|
||||
trim_trailing_whitespace = unset
|
||||
insert_final_newline = unset
|
||||
|
||||
[tools/server/webui/**]
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
end_of_line = unset
|
||||
charset = unset
|
||||
trim_trailing_whitespace = unset
|
||||
insert_final_newline = unset
|
||||
|
||||
+30
-15
@@ -56,7 +56,7 @@ env:
|
||||
|
||||
jobs:
|
||||
macOS-latest-cmake-arm64:
|
||||
runs-on: macos-14
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -88,6 +88,7 @@ jobs:
|
||||
-DGGML_METAL_SHADER_DEBUG=ON \
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
@@ -126,7 +127,8 @@ jobs:
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
@@ -136,7 +138,7 @@ jobs:
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
macOS-latest-cmake-arm64-webgpu:
|
||||
runs-on: macos-14
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -709,6 +711,7 @@ jobs:
|
||||
|
||||
macOS-latest-swift:
|
||||
runs-on: macos-latest
|
||||
needs: ios-xcode-build
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -725,6 +728,12 @@ jobs:
|
||||
key: macOS-latest-swift
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Download xcframework artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: llama-xcframework
|
||||
path: build-apple/llama.xcframework/
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
@@ -746,11 +755,6 @@ jobs:
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
./build-xcframework.sh
|
||||
|
||||
windows-msys2:
|
||||
runs-on: windows-2025
|
||||
|
||||
@@ -1050,9 +1054,13 @@ jobs:
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
windows-latest-cmake-hip:
|
||||
if: ${{ github.event.inputs.create_release != 'true' }}
|
||||
runs-on: windows-2022
|
||||
|
||||
env:
|
||||
# The ROCm version must correspond to the version used in the HIP SDK.
|
||||
ROCM_VERSION: "6.4.2"
|
||||
HIPSDK_INSTALLER_VERSION: "25.Q3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
@@ -1061,16 +1069,14 @@ jobs:
|
||||
- name: Clone rocWMMA repository
|
||||
id: clone_rocwmma
|
||||
run: |
|
||||
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
||||
git clone https://github.com/rocm/rocwmma --branch rocm-${{ env.ROCM_VERSION }} --depth 1
|
||||
|
||||
- name: Cache ROCm Installation
|
||||
id: cache-rocm
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: rocm-6.1-${{ runner.os }}-v1
|
||||
restore-keys: |
|
||||
rocm-6.1-${{ runner.os }}-
|
||||
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Install ROCm
|
||||
if: steps.cache-rocm.outputs.cache-hit != 'true'
|
||||
@@ -1078,7 +1084,7 @@ jobs:
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
|
||||
$completed = $proc.WaitForExit(600000)
|
||||
@@ -1166,8 +1172,17 @@ jobs:
|
||||
run: |
|
||||
./build-xcframework.sh
|
||||
|
||||
- name: Upload xcframework artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: llama-xcframework
|
||||
path: build-apple/llama.xcframework/
|
||||
retention-days: 1
|
||||
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
run: |
|
||||
xcodebuild -downloadPlatform iOS
|
||||
xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
android-build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -108,7 +108,8 @@ jobs:
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Determine tag name
|
||||
@@ -528,11 +529,14 @@ jobs:
|
||||
windows-hip:
|
||||
runs-on: windows-2022
|
||||
|
||||
env:
|
||||
HIPSDK_INSTALLER_VERSION: "25.Q3"
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- name: "radeon"
|
||||
gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
|
||||
gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -542,21 +546,19 @@ jobs:
|
||||
- name: Clone rocWMMA repository
|
||||
id: clone_rocwmma
|
||||
run: |
|
||||
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
||||
git clone https://github.com/rocm/rocwmma --branch develop --depth 1
|
||||
|
||||
- name: Cache ROCm Installation
|
||||
id: cache-rocm
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: rocm-6.1-${{ runner.os }}-v1
|
||||
restore-keys: |
|
||||
rocm-6.1-${{ runner.os }}-
|
||||
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-hip-${{ matrix.name }}-x64
|
||||
key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install ROCm
|
||||
@@ -565,7 +567,7 @@ jobs:
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
|
||||
$completed = $proc.WaitForExit(600000)
|
||||
@@ -610,9 +612,12 @@ jobs:
|
||||
-DLLAMA_CURL=OFF
|
||||
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
|
||||
md "build\bin\rocblas\library\"
|
||||
md "build\bin\hipblaslt\library"
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
|
||||
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
|
||||
+192
-37
@@ -76,51 +76,206 @@ jobs:
|
||||
run: |
|
||||
pip install -r tools/server/tests/requirements.txt
|
||||
|
||||
# Setup nodejs (to be used for verifying bundled index.html)
|
||||
- uses: actions/setup-node@v4
|
||||
webui-setup:
|
||||
name: WebUI Setup
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
node-version: '22.11.0'
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: WebUI - Install dependencies
|
||||
id: webui_lint
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/server/webui/package-lock.json"
|
||||
|
||||
- name: Cache node_modules
|
||||
uses: actions/cache@v4
|
||||
id: cache-node-modules
|
||||
with:
|
||||
path: tools/server/webui/node_modules
|
||||
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node-modules-
|
||||
|
||||
- name: Install dependencies
|
||||
if: steps.cache-node-modules.outputs.cache-hit != 'true'
|
||||
run: npm ci
|
||||
working-directory: tools/server/webui
|
||||
|
||||
webui-check:
|
||||
needs: webui-setup
|
||||
name: WebUI Check
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
|
||||
- name: Restore node_modules cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: tools/server/webui/node_modules
|
||||
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node-modules-
|
||||
|
||||
- name: Run type checking
|
||||
run: npm run check
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run linting
|
||||
run: npm run lint
|
||||
working-directory: tools/server/webui
|
||||
|
||||
webui-build:
|
||||
needs: webui-check
|
||||
name: WebUI Build
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
|
||||
- name: Restore node_modules cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: tools/server/webui/node_modules
|
||||
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node-modules-
|
||||
|
||||
- name: Build application
|
||||
run: npm run build
|
||||
working-directory: tools/server/webui
|
||||
|
||||
webui-tests:
|
||||
needs: webui-build
|
||||
name: Run WebUI tests
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
|
||||
- name: Restore node_modules cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: tools/server/webui/node_modules
|
||||
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node-modules-
|
||||
|
||||
- name: Install Playwright browsers
|
||||
run: npx playwright install --with-deps
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Build Storybook
|
||||
run: npm run build-storybook
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run Client tests
|
||||
run: npm run test:client
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run Server tests
|
||||
run: npm run test:server
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run UI tests
|
||||
run: npm run test:ui
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run E2E tests
|
||||
run: npm run test:e2e
|
||||
working-directory: tools/server/webui
|
||||
|
||||
server-build:
|
||||
needs: [webui-tests]
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
|
||||
build_type: [RelWithDebInfo]
|
||||
include:
|
||||
- build_type: Release
|
||||
sanitizer: ""
|
||||
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
|
||||
|
||||
steps:
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
cd tools/server/webui
|
||||
npm ci
|
||||
sudo apt-get update
|
||||
sudo apt-get -y install \
|
||||
build-essential \
|
||||
xxd \
|
||||
git \
|
||||
cmake \
|
||||
curl \
|
||||
wget \
|
||||
language-pack-en \
|
||||
libcurl4-openssl-dev
|
||||
|
||||
- name: WebUI - Check code format
|
||||
id: webui_format
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Tests dependencies
|
||||
id: test_dependencies
|
||||
run: |
|
||||
git config --global --add safe.directory $(realpath .)
|
||||
cd tools/server/webui
|
||||
git status
|
||||
pip install -r tools/server/tests/requirements.txt
|
||||
|
||||
npm run format
|
||||
git status
|
||||
modified_files="$(git status -s)"
|
||||
echo "Modified files: ${modified_files}"
|
||||
if [ -n "${modified_files}" ]; then
|
||||
echo "Files do not follow coding style. To fix: npm run format"
|
||||
echo "${modified_files}"
|
||||
exit 1
|
||||
fi
|
||||
- name: Setup Node.js for WebUI
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/server/webui/package-lock.json"
|
||||
|
||||
- name: Verify bundled index.html
|
||||
id: verify_server_index_html
|
||||
run: |
|
||||
git config --global --add safe.directory $(realpath .)
|
||||
cd tools/server/webui
|
||||
git status
|
||||
- name: Install WebUI dependencies
|
||||
run: npm ci
|
||||
working-directory: tools/server/webui
|
||||
|
||||
npm run build
|
||||
git status
|
||||
modified_files="$(git status -s)"
|
||||
echo "Modified files: ${modified_files}"
|
||||
if [ -n "${modified_files}" ]; then
|
||||
echo "Repository is dirty or server/webui is not built as expected"
|
||||
echo "Hint: You may need to follow Web UI build guide in server/README.md"
|
||||
echo "${modified_files}"
|
||||
exit 1
|
||||
fi
|
||||
- name: Build WebUI
|
||||
run: npm run build
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
|
||||
@@ -148,3 +148,7 @@ poetry.toml
|
||||
/run-vim.sh
|
||||
/run-chat.sh
|
||||
.ccache/
|
||||
|
||||
# Code Workspace
|
||||
*.code-workspace
|
||||
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
---
|
||||
trigger: manual
|
||||
---
|
||||
|
||||
#### Tailwind & CSS
|
||||
|
||||
- We are using Tailwind v4 which uses oklch colors so we now want to refer to the CSS vars directly, without wrapping it with any color function like `hsla/hsl`, `rgba` etc.
|
||||
@@ -0,0 +1,48 @@
|
||||
---
|
||||
trigger: manual
|
||||
---
|
||||
|
||||
# Coding rules
|
||||
|
||||
## Svelte & SvelteKit
|
||||
|
||||
### Services vs Stores Separation Pattern
|
||||
|
||||
#### `lib/services/` - Pure Business Logic
|
||||
|
||||
- **Purpose**: Stateless business logic and external communication
|
||||
- **Contains**:
|
||||
- API calls to external services (ApiService)
|
||||
- Pure business logic functions (ChatService, etc.)
|
||||
- **Rules**:
|
||||
- NO Svelte runes ($state, $derived, $effect)
|
||||
- NO reactive state management
|
||||
- Pure functions and classes only
|
||||
- Can import types but not stores
|
||||
- Focus on "how" - implementation details
|
||||
|
||||
#### `lib/stores/` - Reactive State Management
|
||||
|
||||
- **Purpose**: Svelte-specific reactive state with runes
|
||||
- **Contains**:
|
||||
- Reactive state classes with $state, $derived, $effect
|
||||
- Database operations (DatabaseStore)
|
||||
- UI-focused state management
|
||||
- Store orchestration logic
|
||||
- **Rules**:
|
||||
- USE Svelte runes for reactivity
|
||||
- Import and use services for business logic
|
||||
- NO direct database operations
|
||||
- NO direct API calls (use services)
|
||||
- Focus on "what" - reactive state for UI
|
||||
|
||||
#### Enforcement
|
||||
|
||||
- Services should be testable without Svelte
|
||||
- Stores should leverage Svelte's reactivity system
|
||||
- Clear separation: services handle data, stores handle state
|
||||
- Services can be reused across multiple stores
|
||||
|
||||
#### Misc
|
||||
|
||||
- Always use `let` for $derived state variables
|
||||
@@ -0,0 +1,9 @@
|
||||
---
|
||||
trigger: manual
|
||||
---
|
||||
|
||||
# Automated Tests
|
||||
|
||||
## General rules
|
||||
|
||||
- NEVER include any test code in the production code - we should always have it in a separate dedicated files
|
||||
@@ -0,0 +1,7 @@
|
||||
---
|
||||
trigger: manual
|
||||
---
|
||||
|
||||
## TypeScript
|
||||
|
||||
- Add JSDocs for functions
|
||||
@@ -58,6 +58,12 @@ if (MSVC)
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
|
||||
endif()
|
||||
|
||||
if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
|
||||
set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
|
||||
else()
|
||||
set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE})
|
||||
endif()
|
||||
|
||||
#
|
||||
# option list
|
||||
#
|
||||
@@ -82,6 +88,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
|
||||
|
||||
# 3rd party libs
|
||||
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
|
||||
|
||||
@@ -45,7 +45,7 @@ SRC=`pwd`
|
||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
@@ -270,7 +270,9 @@ function gg_run_ctest_with_model_debug {
|
||||
local model; model=$(gg_get_model)
|
||||
cd build-ci-debug
|
||||
set -e
|
||||
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
cd ..
|
||||
}
|
||||
@@ -281,7 +283,15 @@ function gg_run_ctest_with_model_release {
|
||||
local model; model=$(gg_get_model)
|
||||
cd build-ci-release
|
||||
set -e
|
||||
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
# test memory leaks
|
||||
#if [[ ! -z ${GG_BUILD_METAL} ]]; then
|
||||
# # TODO: this hangs for some reason ...
|
||||
# (time leaks -quiet -atExit -- ./bin/test-thread-safety -m $model --parallel 2 -t 2 -p "hello") 2>&1 | tee -a $OUT/${ci}-leaks.log
|
||||
#fi
|
||||
|
||||
set +e
|
||||
cd ..
|
||||
}
|
||||
@@ -860,10 +870,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
fi
|
||||
|
||||
ret=0
|
||||
if [ -z ${GG_BUILD_SYCL} ]; then
|
||||
# SYCL build breaks with debug build flags
|
||||
test $ret -eq 0 && gg_run ctest_debug
|
||||
fi
|
||||
test $ret -eq 0 && gg_run ctest_debug
|
||||
test $ret -eq 0 && gg_run ctest_release
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
@@ -871,9 +878,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
test $ret -eq 0 && gg_run rerank_tiny
|
||||
|
||||
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
|
||||
if [ -z ${GG_BUILD_SYCL} ]; then
|
||||
test $ret -eq 0 && gg_run test_scripts_debug
|
||||
fi
|
||||
test $ret -eq 0 && gg_run test_scripts_debug
|
||||
test $ret -eq 0 && gg_run test_scripts_release
|
||||
fi
|
||||
|
||||
@@ -884,9 +889,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
test $ret -eq 0 && gg_run pythia_2_8b
|
||||
#test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||
fi
|
||||
if [ -z ${GG_BUILD_SYCL} ]; then
|
||||
test $ret -eq 0 && gg_run ctest_with_model_debug
|
||||
fi
|
||||
test $ret -eq 0 && gg_run ctest_with_model_debug
|
||||
test $ret -eq 0 && gg_run ctest_with_model_release
|
||||
fi
|
||||
fi
|
||||
|
||||
+141
-21
@@ -745,6 +745,124 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
|
||||
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
//
|
||||
// Docker registry functions
|
||||
//
|
||||
|
||||
static std::string common_docker_get_token(const std::string & repo) {
|
||||
std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull";
|
||||
|
||||
common_remote_params params;
|
||||
auto res = common_remote_get_content(url, params);
|
||||
|
||||
if (res.first != 200) {
|
||||
throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first));
|
||||
}
|
||||
|
||||
std::string response_str(res.second.begin(), res.second.end());
|
||||
nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str);
|
||||
|
||||
if (!response.contains("token")) {
|
||||
throw std::runtime_error("Docker registry token response missing 'token' field");
|
||||
}
|
||||
|
||||
return response["token"].get<std::string>();
|
||||
}
|
||||
|
||||
static std::string common_docker_resolve_model(const std::string & docker) {
|
||||
// Parse ai/smollm2:135M-Q4_K_M
|
||||
size_t colon_pos = docker.find(':');
|
||||
std::string repo, tag;
|
||||
if (colon_pos != std::string::npos) {
|
||||
repo = docker.substr(0, colon_pos);
|
||||
tag = docker.substr(colon_pos + 1);
|
||||
} else {
|
||||
repo = docker;
|
||||
tag = "latest";
|
||||
}
|
||||
|
||||
// ai/ is the default
|
||||
size_t slash_pos = docker.find('/');
|
||||
if (slash_pos == std::string::npos) {
|
||||
repo.insert(0, "ai/");
|
||||
}
|
||||
|
||||
LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str());
|
||||
try {
|
||||
// --- helper: digest validation ---
|
||||
auto validate_oci_digest = [](const std::string & digest) -> std::string {
|
||||
// Expected: algo:hex ; start with sha256 (64 hex chars)
|
||||
// You can extend this map if supporting other algorithms in future.
|
||||
static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
|
||||
std::smatch m;
|
||||
if (!std::regex_match(digest, m, re)) {
|
||||
throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
|
||||
}
|
||||
// normalize hex to lowercase
|
||||
std::string normalized = digest;
|
||||
std::transform(normalized.begin()+7, normalized.end(), normalized.begin()+7, [](unsigned char c){
|
||||
return std::tolower(c);
|
||||
});
|
||||
return normalized;
|
||||
};
|
||||
|
||||
std::string token = common_docker_get_token(repo); // Get authentication token
|
||||
|
||||
// Get manifest
|
||||
const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
|
||||
std::string manifest_url = url_prefix + "/manifests/" + tag;
|
||||
common_remote_params manifest_params;
|
||||
manifest_params.headers.push_back("Authorization: Bearer " + token);
|
||||
manifest_params.headers.push_back(
|
||||
"Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
|
||||
auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
|
||||
if (manifest_res.first != 200) {
|
||||
throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
|
||||
}
|
||||
|
||||
std::string manifest_str(manifest_res.second.begin(), manifest_res.second.end());
|
||||
nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
|
||||
std::string gguf_digest; // Find the GGUF layer
|
||||
if (manifest.contains("layers")) {
|
||||
for (const auto & layer : manifest["layers"]) {
|
||||
if (layer.contains("mediaType")) {
|
||||
std::string media_type = layer["mediaType"].get<std::string>();
|
||||
if (media_type == "application/vnd.docker.ai.gguf.v3" ||
|
||||
media_type.find("gguf") != std::string::npos) {
|
||||
gguf_digest = layer["digest"].get<std::string>();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (gguf_digest.empty()) {
|
||||
throw std::runtime_error("No GGUF layer found in Docker manifest");
|
||||
}
|
||||
|
||||
// Validate & normalize digest
|
||||
gguf_digest = validate_oci_digest(gguf_digest);
|
||||
LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
|
||||
|
||||
// Prepare local filename
|
||||
std::string model_filename = repo;
|
||||
std::replace(model_filename.begin(), model_filename.end(), '/', '_');
|
||||
model_filename += "_" + tag + ".gguf";
|
||||
std::string local_path = fs_get_cache_file(model_filename);
|
||||
|
||||
const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
|
||||
if (!common_download_file_single(blob_url, local_path, token, false)) {
|
||||
throw std::runtime_error("Failed to download Docker Model");
|
||||
}
|
||||
|
||||
LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
|
||||
return local_path;
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// utils
|
||||
//
|
||||
@@ -795,7 +913,9 @@ static handle_model_result common_params_handle_model(
|
||||
handle_model_result result;
|
||||
// handle pre-fill default model path and url based on hf_repo and hf_file
|
||||
{
|
||||
if (!model.hf_repo.empty()) {
|
||||
if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
|
||||
model.path = common_docker_resolve_model(model.docker_repo);
|
||||
} else if (!model.hf_repo.empty()) {
|
||||
// short-hand to avoid specifying --hf-file -> default it to --model
|
||||
if (model.hf_file.empty()) {
|
||||
if (model.path.empty()) {
|
||||
@@ -1184,7 +1304,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
|
||||
} else {
|
||||
for (const auto & device : dev_names) {
|
||||
auto * dev = ggml_backend_dev_by_name(device.c_str());
|
||||
if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
|
||||
if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
|
||||
}
|
||||
devices.push_back(dev);
|
||||
@@ -1194,7 +1314,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
|
||||
return devices;
|
||||
}
|
||||
|
||||
static void add_rpc_devices(std::string servers) {
|
||||
static void add_rpc_devices(const std::string & servers) {
|
||||
auto rpc_servers = string_split<std::string>(servers, ',');
|
||||
if (rpc_servers.empty()) {
|
||||
throw std::invalid_argument("no RPC servers specified");
|
||||
@@ -1584,7 +1704,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.system_prompt = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
|
||||
add_opt(common_arg(
|
||||
{"--no-perf"},
|
||||
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
||||
@@ -2396,24 +2516,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--list-devices"},
|
||||
"print list of available devices and exit",
|
||||
[](common_params &) {
|
||||
std::vector<ggml_backend_dev_t> rpc_devices;
|
||||
std::vector<ggml_backend_dev_t> all_devices;
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
|
||||
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
||||
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
|
||||
rpc_devices.push_back(dev);
|
||||
} else {
|
||||
all_devices.push_back(dev);
|
||||
}
|
||||
if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
devices.push_back(dev);
|
||||
}
|
||||
}
|
||||
// insert RPC devices in front
|
||||
all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
|
||||
printf("Available devices:\n");
|
||||
for (size_t i = 0; i < all_devices.size(); ++i) {
|
||||
auto * dev = all_devices[i];
|
||||
for (auto * dev : devices) {
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
|
||||
@@ -2437,7 +2548,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--cpu-moe", "-cmoe"},
|
||||
"keep all Mixture of Experts (MoE) weights in the CPU",
|
||||
[](common_params & params) {
|
||||
params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
|
||||
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
|
||||
}
|
||||
).set_env("LLAMA_ARG_CPU_MOE"));
|
||||
add_opt(common_arg(
|
||||
@@ -2450,7 +2561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
for (int i = 0; i < value; ++i) {
|
||||
// keep strings alive and avoid leaking memory by storing them in a static vector
|
||||
static std::list<std::string> buft_overrides;
|
||||
buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
|
||||
buft_overrides.push_back(llm_ffn_exps_block_regex(i));
|
||||
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
|
||||
}
|
||||
}
|
||||
@@ -2459,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--cpu-moe-draft", "-cmoed"},
|
||||
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
|
||||
[](common_params & params) {
|
||||
params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
|
||||
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
|
||||
add_opt(common_arg(
|
||||
@@ -2471,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
for (int i = 0; i < value; ++i) {
|
||||
static std::list<std::string> buft_overrides_draft;
|
||||
buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
|
||||
buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
|
||||
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
|
||||
}
|
||||
}
|
||||
@@ -2636,6 +2747,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.model.url = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_MODEL_URL"));
|
||||
add_opt(common_arg(
|
||||
{ "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
|
||||
"Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
|
||||
"example: gemma3\n"
|
||||
"(default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.docker_repo = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_DOCKER_REPO"));
|
||||
add_opt(common_arg(
|
||||
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
|
||||
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
|
||||
|
||||
+23
-8
@@ -193,10 +193,11 @@ struct common_params_sampling {
|
||||
};
|
||||
|
||||
struct common_params_model {
|
||||
std::string path = ""; // model local path // NOLINT
|
||||
std::string url = ""; // model url to download // NOLINT
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string path = ""; // model local path // NOLINT
|
||||
std::string url = ""; // model url to download // NOLINT
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string docker_repo = ""; // Docker repo // NOLINT
|
||||
};
|
||||
|
||||
struct common_params_speculative {
|
||||
@@ -287,9 +288,9 @@ struct common_params {
|
||||
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
|
||||
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
|
||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
|
||||
float yarn_beta_fast = -1.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = -1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
|
||||
// offload params
|
||||
@@ -452,7 +453,7 @@ struct common_params {
|
||||
|
||||
std::string slot_save_path;
|
||||
|
||||
float slot_prompt_similarity = 0.5f;
|
||||
float slot_prompt_similarity = 0.1f;
|
||||
|
||||
// batched-bench params
|
||||
bool is_pp_shared = false;
|
||||
@@ -733,6 +734,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// MoE utils
|
||||
//
|
||||
|
||||
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
|
||||
|
||||
static std::string llm_ffn_exps_block_regex(int idx) {
|
||||
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
|
||||
}
|
||||
|
||||
static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
|
||||
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
|
||||
}
|
||||
|
||||
//
|
||||
// training utils
|
||||
//
|
||||
|
||||
@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
|
||||
};
|
||||
|
||||
static bool is_reserved_name(const std::string & name) {
|
||||
static std::unordered_set<std::string> RESERVED_NAMES;
|
||||
if (RESERVED_NAMES.empty()) {
|
||||
RESERVED_NAMES.insert("root");
|
||||
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
|
||||
for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
|
||||
}
|
||||
static const std::unordered_set<std::string> RESERVED_NAMES = [] {
|
||||
std::unordered_set<std::string> s;
|
||||
s.insert("root");
|
||||
for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
|
||||
for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
|
||||
return s;
|
||||
}();
|
||||
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
|
||||
}
|
||||
|
||||
|
||||
+189
-29
@@ -735,6 +735,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
|
||||
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
|
||||
res = "qwen2"
|
||||
if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
|
||||
# ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
|
||||
res = "grok-2"
|
||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
res = "llama-bpe"
|
||||
@@ -885,6 +888,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
|
||||
# ref: https://huggingface.co/JetBrains/Mellum-4b-base
|
||||
res = "mellum"
|
||||
if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
|
||||
# ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
|
||||
res = "llada-moe"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2387,7 +2393,10 @@ class SmolVLMModel(MmprojModel):
|
||||
return [] # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register("Llama4ForConditionalGeneration")
|
||||
@ModelBase.register(
|
||||
"Llama4ForConditionalGeneration",
|
||||
"Llama4ForCausalLM",
|
||||
)
|
||||
class Llama4Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA4
|
||||
undo_permute = False
|
||||
@@ -2405,6 +2414,10 @@ class Llama4Model(LlamaModel):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
|
||||
if "layer_types" in self.hparams:
|
||||
if all(lt == "full_attention" for lt in self.hparams["layer_types"]):
|
||||
# all layers are full attention (for MobileLLM), disable swa
|
||||
self.gguf_writer.add_sliding_window(0)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
if name.startswith("language_model."):
|
||||
@@ -2682,12 +2695,20 @@ class BitnetModel(TextModel):
|
||||
yield (new_name, data_torch)
|
||||
|
||||
|
||||
@ModelBase.register("GrokForCausalLM")
|
||||
@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM")
|
||||
class GrokModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.GROK
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
if (self.dir_model / 'tokenizer.model').is_file():
|
||||
self._set_vocab_sentencepiece()
|
||||
return
|
||||
|
||||
if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file():
|
||||
logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer')
|
||||
sys.exit(1)
|
||||
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@@ -2695,11 +2716,46 @@ class GrokModel(TextModel):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0))
|
||||
self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0))
|
||||
if (final_logit_softcap := self.hparams.get("final_logit_softcapping")):
|
||||
self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
|
||||
|
||||
if (rope_dim := self.hparams.get("head_dim")) is None:
|
||||
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
|
||||
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
|
||||
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
|
||||
|
||||
# Treat "original" as "yarn", seems to have been a mistake
|
||||
if self.hparams.get("rope_type") in ("yarn", "original"):
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"])
|
||||
|
||||
if temp_len := self.hparams.get("attn_temperature_len"):
|
||||
self.gguf_writer.add_attn_temperature_length(temp_len)
|
||||
|
||||
self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5))
|
||||
self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"])
|
||||
self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"])
|
||||
|
||||
_experts: list[dict[str, list[Tensor]]] | None = None
|
||||
_cur_expert = ""
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
is_expert = ".moe." in name or ".block_sparse_moe.experts." in name
|
||||
|
||||
if not is_expert:
|
||||
tensors.append((self.map_tensor_name(name), data_torch))
|
||||
|
||||
# process the experts separately
|
||||
if name.find(".moe.") != -1:
|
||||
if is_expert or self._cur_expert:
|
||||
n_experts = self.hparams["num_local_experts"]
|
||||
|
||||
assert bid is not None
|
||||
@@ -2707,32 +2763,41 @@ class GrokModel(TextModel):
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for wid in ["linear", "linear_1", "linear_v"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
return tensors
|
||||
else:
|
||||
# concatenate split tensors
|
||||
if name in self._experts[bid]:
|
||||
self._cur_expert = name
|
||||
self._experts[bid][name].append(data_torch)
|
||||
return []
|
||||
elif is_expert:
|
||||
self._cur_expert = name
|
||||
self._experts[bid][name] = [data_torch]
|
||||
return []
|
||||
else:
|
||||
self._cur_expert = ""
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
for bid in range(self.block_count):
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
# merge the experts into a single 3d tensor
|
||||
for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight"
|
||||
if ename not in self._experts[bid]:
|
||||
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight"
|
||||
tensor_list = self._experts[bid][ename]
|
||||
datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
yield (new_name, data_torch)
|
||||
|
||||
yield from tensors
|
||||
|
||||
|
||||
@ModelBase.register("DbrxForCausalLM")
|
||||
@@ -5951,9 +6016,34 @@ class SeedOssModel(TextModel):
|
||||
|
||||
|
||||
@ModelBase.register("Olmo2ForCausalLM")
|
||||
@ModelBase.register("Olmo3ForCausalLM")
|
||||
class Olmo2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.OLMO2
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
if "sliding_window" in self.hparams:
|
||||
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
|
||||
|
||||
sliding_window_pattern = []
|
||||
if "layer_types" in self.hparams:
|
||||
sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
|
||||
else:
|
||||
# Olmo2 does not use sliding window attention.
|
||||
# Olmo3 defaults to using sliding window for all layers except every 4th.
|
||||
for i in range(self.hparams["num_hidden_layers"]):
|
||||
sliding_window_pattern.append((i + 1) % 4 != 0)
|
||||
|
||||
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
|
||||
|
||||
|
||||
@ModelBase.register("OlmoeForCausalLM")
|
||||
class OlmoeModel(TextModel):
|
||||
@@ -8184,6 +8274,76 @@ class HunYuanMoEModel(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
|
||||
class LLaDAMoEModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLADA_MOE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
if (n_experts := self.hparams.get("num_experts")) is not None:
|
||||
self.gguf_writer.add_expert_count(n_experts)
|
||||
|
||||
if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
|
||||
self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
|
||||
|
||||
# number of experts used per token (top-k)
|
||||
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
|
||||
self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||
|
||||
self.gguf_writer.add_mask_token_id(156895)
|
||||
self.gguf_writer.add_causal_attention(False)
|
||||
self.gguf_writer.add_diffusion_shift_logits(False)
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
# Copied from: Qwen2MoeModel
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# process the experts separately
|
||||
if name.find("experts") != -1:
|
||||
n_experts = self.hparams["num_experts"]
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
return tensors
|
||||
else:
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
# Copied from: Qwen2MoeModel
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanDenseV1ForCausalLM")
|
||||
class HunYuanModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
|
||||
|
||||
@@ -139,6 +139,7 @@ models = [
|
||||
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
|
||||
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
|
||||
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
|
||||
{"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
@@ -158,6 +159,7 @@ pre_computed_hashes = [
|
||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
|
||||
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
|
||||
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
|
||||
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
|
||||
]
|
||||
|
||||
|
||||
|
||||
+3
-3
@@ -241,8 +241,8 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||
| | VX/VXE/VXE2 | zDNN | Spyre |
|
||||
|------------|-------------|------|-------|
|
||||
| FP32 | ✅ | ✅ | ❓ |
|
||||
| FP16 | ✅ | ❓ | ❓ |
|
||||
| BF16 | 🚫 | ❓ | ❓ |
|
||||
| FP16 | ✅ | ✅ | ❓ |
|
||||
| BF16 | 🚫 | ✅ | ❓ |
|
||||
| Q4_0 | ✅ | ❓ | ❓ |
|
||||
| Q4_1 | ✅ | ❓ | ❓ |
|
||||
| MXFP4 | 🚫 | ❓ | ❓ |
|
||||
@@ -272,4 +272,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||
- 🚫 - acceleration unavailable, will still run using scalar implementation
|
||||
- ❓ - acceleration unknown, please contribute if you can test it yourself
|
||||
|
||||
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 6, 2025.
|
||||
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 7, 2025.
|
||||
|
||||
@@ -18,6 +18,7 @@ Legend:
|
||||
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| ADD_ID | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
@@ -26,6 +27,7 @@ Legend:
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
|
||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||
| CONV_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
|
||||
@@ -49,9 +51,11 @@ Legend:
|
||||
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
|
||||
| IM2COL_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
@@ -61,7 +65,9 @@ Legend:
|
||||
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
|
||||
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||
| NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||
| OPT_STEP_SGD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
|
||||
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
@@ -98,6 +104,7 @@ Legend:
|
||||
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||
| SWIGLU_OAI | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
|
||||
|
||||
+7667
-3447
File diff suppressed because it is too large
Load Diff
@@ -510,19 +510,27 @@ static void diffusion_generate(llama_context * ctx,
|
||||
n_generated = params.max_length;
|
||||
}
|
||||
|
||||
static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
|
||||
static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) {
|
||||
if (!use_chat_template) {
|
||||
return prompt;
|
||||
}
|
||||
|
||||
auto chat_templates = common_chat_templates_init(model, "");
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
common_chat_msg user_msg;
|
||||
user_msg.role = "user";
|
||||
user_msg.content = prompt;
|
||||
inputs.add_generation_prompt = true;
|
||||
common_chat_msg system_msg;
|
||||
|
||||
if (!system_prompt.empty()) {
|
||||
system_msg.role = "system";
|
||||
system_msg.content = system_prompt;
|
||||
inputs.messages.push_back(system_msg);
|
||||
}
|
||||
|
||||
common_chat_msg user_msg;
|
||||
user_msg.role = "user";
|
||||
user_msg.content = prompt;
|
||||
|
||||
inputs.messages.push_back(user_msg);
|
||||
inputs.add_generation_prompt = true;
|
||||
|
||||
auto result = common_chat_templates_apply(chat_templates.get(), inputs);
|
||||
|
||||
@@ -579,7 +587,8 @@ int main(int argc, char ** argv) {
|
||||
llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads);
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);
|
||||
|
||||
std::string formatted_prompt = format_input_text(params.prompt, params.system_prompt, params.enable_chat_template, model);
|
||||
|
||||
std::vector<llama_token> input_tokens = common_tokenize(vocab,
|
||||
formatted_prompt,
|
||||
@@ -596,6 +605,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
llama_token mask_token_id = llama_vocab_mask(vocab);
|
||||
|
||||
GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);
|
||||
|
||||
bool visual_mode = params.diffusion.visual_mode;
|
||||
|
||||
@@ -145,6 +145,20 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
if (llama_encode(ctx, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
|
||||
decoder_start_token_id = llama_vocab_bos(vocab);
|
||||
}
|
||||
|
||||
batch = llama_batch_get_one(&decoder_start_token_id, 1);
|
||||
}
|
||||
|
||||
// main loop
|
||||
|
||||
const auto t_main_start = ggml_time_us();
|
||||
|
||||
@@ -190,7 +190,6 @@ option(GGML_WEBGPU "ggml: use WebGPU"
|
||||
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
|
||||
option(GGML_ZDNN "ggml: use zDNN" OFF)
|
||||
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
||||
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
|
||||
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
||||
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
|
||||
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
|
||||
|
||||
@@ -39,6 +39,7 @@ extern "C" {
|
||||
// user-code should use only these functions
|
||||
//
|
||||
|
||||
// TODO: remove in the future
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
|
||||
|
||||
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||
|
||||
@@ -7,8 +7,6 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
|
||||
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
+4
-4
@@ -284,19 +284,19 @@ __host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexc
|
||||
// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
|
||||
//
|
||||
#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
|
||||
const type prefix##0 = (pointer)->array[0]; \
|
||||
const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
|
||||
GGML_UNUSED(prefix##0);
|
||||
#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
|
||||
GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
|
||||
const type prefix##1 = (pointer)->array[1]; \
|
||||
const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
|
||||
GGML_UNUSED(prefix##1);
|
||||
#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
|
||||
GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
|
||||
const type prefix##2 = (pointer)->array[2]; \
|
||||
const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
|
||||
GGML_UNUSED(prefix##2);
|
||||
#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
|
||||
GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
|
||||
const type prefix##3 = (pointer)->array[3]; \
|
||||
const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
|
||||
GGML_UNUSED(prefix##3);
|
||||
|
||||
#define GGML_TENSOR_UNARY_OP_LOCALS \
|
||||
|
||||
@@ -526,7 +526,10 @@ struct ggml_backend_cann_context {
|
||||
*/
|
||||
aclrtStream stream(int stream) {
|
||||
if (streams[stream] == nullptr) {
|
||||
ggml_cann_set_device(device);
|
||||
// If the device is not set here, destroying the stream later may cause a mismatch
|
||||
// between the thread contexts where the stream was created and destroyed.
|
||||
// However, I printed the device_id, thread_id, and stream, and they are all consistent.
|
||||
ACL_CHECK(aclrtSetDevice(device));
|
||||
ACL_CHECK(aclrtCreateStream(&streams[stream]));
|
||||
}
|
||||
return streams[stream];
|
||||
|
||||
@@ -75,13 +75,12 @@
|
||||
* @param device The device ID to set.
|
||||
*/
|
||||
void ggml_cann_set_device(const int32_t device) {
|
||||
// TODO: uncomment these lines after empty context has fixed.
|
||||
// int current_device;
|
||||
// ACL_CHECK(aclrtGetDevice(¤t_device));
|
||||
int current_device = -1;
|
||||
aclrtGetDevice(¤t_device);
|
||||
|
||||
// if (device == current_device) {
|
||||
// return;
|
||||
// }
|
||||
if (device == current_device) {
|
||||
return;
|
||||
}
|
||||
ACL_CHECK(aclrtSetDevice(device));
|
||||
}
|
||||
|
||||
@@ -1729,6 +1728,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
ggml_cann_get_rows(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
std::cout << "lcg GGML_OP_SET_ROWS"<< std::endl;
|
||||
ggml_cann_set_rows(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_DUP:
|
||||
|
||||
@@ -8599,7 +8599,6 @@ static void ggml_compute_forward_timestep_embedding_f32(
|
||||
}
|
||||
if (dim % 2 != 0 && ith == 0) {
|
||||
embed_data[2 * half] = 0.f;
|
||||
embed_data[dim] = 0.f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -75,6 +75,8 @@
|
||||
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
|
||||
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
|
||||
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
|
||||
|
||||
// Moore Threads
|
||||
@@ -325,6 +327,20 @@ static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
||||
#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
}
|
||||
|
||||
// Maximum number of bytes that can be copied in a single instruction.
|
||||
static constexpr __device__ int ggml_cuda_get_max_cpy_bytes() {
|
||||
#ifdef GGML_USE_HIP
|
||||
return 16;
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
return 16;
|
||||
#else
|
||||
return 8;
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#endif // GGML_USE_HIP
|
||||
}
|
||||
|
||||
|
||||
[[noreturn]]
|
||||
static __device__ void no_device_code(
|
||||
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
||||
|
||||
@@ -647,9 +647,7 @@ static __global__ void flash_attn_stream_k_fixup(
|
||||
}
|
||||
|
||||
template<int D> // D == head size
|
||||
#if !defined(GGML_USE_HIP)
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIP)
|
||||
static __global__ void flash_attn_combine_results(
|
||||
const float * __restrict__ VKQ_parts,
|
||||
const float2 * __restrict__ VKQ_meta,
|
||||
@@ -692,10 +690,7 @@ static __global__ void flash_attn_combine_results(
|
||||
float VKQ_numerator = 0.0f;
|
||||
float VKQ_denominator = 0.0f;
|
||||
for (int l = 0; l < parallel_blocks; ++l) {
|
||||
const float diff = meta[l].x - kqmax;
|
||||
float KQ_max_scale = expf(diff);
|
||||
const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
|
||||
*((uint32_t *) &KQ_max_scale) &= ftz_mask;
|
||||
const float KQ_max_scale = expf(meta[l].x - kqmax);
|
||||
|
||||
VKQ_numerator += KQ_max_scale * VKQ_parts[l*D + tid];
|
||||
VKQ_denominator += KQ_max_scale * meta[l].y;
|
||||
@@ -836,11 +831,10 @@ void launch_fattn(
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
int parallel_blocks = 1;
|
||||
|
||||
const dim3 block_dim(warp_size, nwarps, 1);
|
||||
int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
|
||||
CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
|
||||
int parallel_blocks = max_blocks_per_sm;
|
||||
|
||||
dim3 blocks_num;
|
||||
if (stream_k) {
|
||||
@@ -862,9 +856,6 @@ void launch_fattn(
|
||||
GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0);
|
||||
const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
|
||||
|
||||
// parallel_blocks should be at least large enough to achieve max. occupancy for a single wave:
|
||||
parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1);
|
||||
|
||||
// parallel_blocks must not be larger than what the tensor size allows:
|
||||
parallel_blocks = std::min(parallel_blocks, ntiles_KQ);
|
||||
|
||||
|
||||
+324
-227
@@ -2,20 +2,30 @@
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-tile.cuh"
|
||||
|
||||
#define FATTN_TILE_NTHREADS 256
|
||||
// kq_stride == number of KQ rows to process per iteration
|
||||
// kq_nbatch == number of K columns to load in parallel for KQ calculation
|
||||
|
||||
static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int cc, const int warp_size) {
|
||||
if (GGML_CUDA_CC_IS_AMD(cc)) {
|
||||
if (GGML_CUDA_CC_IS_RDNA(cc)) {
|
||||
switch (D) {
|
||||
case 64:
|
||||
return 128;
|
||||
case 128:
|
||||
case 256:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
switch (D) {
|
||||
case 64:
|
||||
return 64;
|
||||
return ncols == 32 ? 128 : 64;
|
||||
case 128:
|
||||
return ncols == 32 ? 64 : 32;
|
||||
case 256:
|
||||
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
return ncols <= 16 ? 64 : 32;
|
||||
} else {
|
||||
return 64;
|
||||
}
|
||||
return 32;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
return -1;
|
||||
@@ -49,24 +59,28 @@ static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int
|
||||
|
||||
static constexpr __device__ int fattn_tile_get_kq_stride_device(int D, int ncols, int warp_size) {
|
||||
#ifdef GGML_USE_HIP
|
||||
#ifdef RDNA
|
||||
switch (D) {
|
||||
case 64:
|
||||
return 64;
|
||||
return 128;
|
||||
case 128:
|
||||
#if defined(GCN) || defined(CDNA)
|
||||
return ncols <= 16 ? 64 : 32;
|
||||
#else
|
||||
return 64;
|
||||
#endif // defined(GCN) || defined(CDNA)
|
||||
case 256:
|
||||
#if defined(GCN) || defined(CDNA)
|
||||
return ncols <= 16 ? 64 : 32;
|
||||
#else
|
||||
return 64;
|
||||
#endif // defined(GCN) || defined(CDNA)
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
switch (D) {
|
||||
case 64:
|
||||
return ncols == 32 ? 128 : 64;
|
||||
case 128:
|
||||
return ncols == 32 ? 64 : 32;
|
||||
case 256:
|
||||
return 32;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#endif // RDNA
|
||||
#else
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
switch (D) {
|
||||
@@ -100,17 +114,8 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols
|
||||
case 64:
|
||||
return 64;
|
||||
case 128:
|
||||
#if defined(GCN) || defined(CDNA)
|
||||
return ncols <= 16 ? 64 : 128;
|
||||
#else
|
||||
return 64;
|
||||
#endif // defined(GCN) || defined(CDNA)
|
||||
case 256:
|
||||
#if defined(GCN) || defined(CDNA)
|
||||
return ncols <= 16 ? 64 : 128;
|
||||
#else
|
||||
return ncols <= 16 ? 64 : 256;
|
||||
#endif // defined(GCN) || defined(CDNA)
|
||||
return 128;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
@@ -120,9 +125,8 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols
|
||||
case 64:
|
||||
return 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
case 256:
|
||||
return ncols <= 16 ? 64 : 128;
|
||||
return 128;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
@@ -142,12 +146,27 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols
|
||||
GGML_UNUSED_VARS(ncols, warp_size);
|
||||
}
|
||||
|
||||
template<int D, int ncols, bool use_logit_softcap> // D == head size
|
||||
#ifdef GGML_USE_HIP
|
||||
__launch_bounds__(FATTN_TILE_NTHREADS, 1)
|
||||
static int fattn_tile_get_nthreads_host(const int cc, const int ncols) {
|
||||
return 256;
|
||||
GGML_UNUSED_VARS(cc, ncols);
|
||||
}
|
||||
|
||||
static constexpr __device__ int fattn_tile_get_nthreads_device(int ncols) {
|
||||
return 256;
|
||||
GGML_UNUSED(ncols);
|
||||
}
|
||||
|
||||
static constexpr __device__ int fattn_tile_get_occupancy_device(int ncols) {
|
||||
#ifdef RDNA
|
||||
return 3;
|
||||
#else
|
||||
__launch_bounds__(FATTN_TILE_NTHREADS, 2)
|
||||
#endif // GGML_USE_HIP
|
||||
return ncols <= 16 ? 3 : 2;
|
||||
#endif // RDNA
|
||||
GGML_UNUSED(ncols);
|
||||
}
|
||||
|
||||
template<int D, int ncols, bool use_logit_softcap> // D == head size
|
||||
__launch_bounds__(fattn_tile_get_nthreads_device(ncols), fattn_tile_get_occupancy_device(ncols))
|
||||
static __global__ void flash_attn_tile(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
@@ -193,7 +212,7 @@ static __global__ void flash_attn_tile(
|
||||
}
|
||||
|
||||
constexpr int warp_size = 32;
|
||||
constexpr int nwarps = FATTN_TILE_NTHREADS / warp_size;
|
||||
constexpr int nwarps = fattn_tile_get_nthreads_device(ncols) / warp_size;
|
||||
constexpr int kq_stride = fattn_tile_get_kq_stride_device(D, ncols, warp_size);
|
||||
static_assert(kq_stride % warp_size == 0, "kq_stride not divisable by warp_size.");
|
||||
constexpr int kq_nbatch = fattn_tile_get_kq_nbatch_device(D, ncols, warp_size);
|
||||
@@ -206,90 +225,126 @@ static __global__ void flash_attn_tile(
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float * sinksf = (const float *) (sinks);
|
||||
const float * Q_f = (const float *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float * sinksf = (const float *) (sinks);
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
||||
|
||||
#if defined(GGML_USE_HIP)
|
||||
constexpr int cpy_nb = 16;
|
||||
#else
|
||||
constexpr int cpy_nb = 8;
|
||||
#endif // defined(GGML_USE_HIP) && defined(GCN)
|
||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
|
||||
__shared__ float KQ[ncols][kq_stride];
|
||||
constexpr int cpw = ncols/nwarps; // cols per warp
|
||||
|
||||
// softmax_iter_j == number of KQ columns for which to calculate softmax in parallel.
|
||||
// KQ is originall 2D but uses a Z-shaped memory pattern for larger reads/writes.
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
constexpr int softmax_iter_j = cpw < 2*cpy_ne ? cpw : 2*cpy_ne;
|
||||
|
||||
__shared__ half KQ[ncols/softmax_iter_j][kq_stride][softmax_iter_j];
|
||||
__shared__ half2 Q_tmp[ncols][D/2];
|
||||
__shared__ half2 KV_tmp_h2[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts.
|
||||
half2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
|
||||
__shared__ half2 KV_tmp[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts.
|
||||
half2 VKQ[cpw][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
|
||||
#else
|
||||
constexpr int softmax_iter_j = cpw < 1*cpy_ne ? cpw : 1*cpy_ne;
|
||||
|
||||
__shared__ float KQ[ncols/softmax_iter_j][kq_stride][softmax_iter_j];
|
||||
__shared__ float Q_tmp[ncols][D];
|
||||
__shared__ float KV_tmp_f[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts.
|
||||
float2 * KV_tmp_f2 = (float2 *) KV_tmp_f;
|
||||
float2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
|
||||
__shared__ float KV_tmp[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts.
|
||||
float2 VKQ[cpw][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
static_assert(cpw % softmax_iter_j == 0, "bad softmax_iter_j");
|
||||
|
||||
|
||||
float kqmax[ncols/nwarps];
|
||||
float KQ_max[cpw];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
kqmax[j0/nwarps] = -FLT_MAX/2.0f;
|
||||
KQ_max[j0/nwarps] = -FLT_MAX/2.0f;
|
||||
}
|
||||
float kqsum[ncols/nwarps] = {0.0f};
|
||||
float KQ_sum[cpw] = {0.0f};
|
||||
|
||||
// Load Q data, convert to FP16 if fast.
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < cpw; ++j0) {
|
||||
const int j = j0 + threadIdx.y*cpw;
|
||||
|
||||
constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size;
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) {
|
||||
float tmp_f[cpy_ne_D] = {0.0f};
|
||||
if (ic0 + j < ne01) {
|
||||
ggml_cuda_memcpy_1<sizeof(tmp_f)>(tmp_f, &Q_f[j*(nb01/sizeof(float)) + i0 + threadIdx.x*cpy_ne_D]);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0 + threadIdx.x] : make_float2(0.0f, 0.0f);
|
||||
for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
|
||||
tmp_f[i1] *= scale;
|
||||
}
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
Q_tmp[j][i0 + threadIdx.x] = make_half2(tmp.x * scale, tmp.y * scale);
|
||||
half2 tmp_h2[cpy_ne_D/2];
|
||||
#pragma unroll
|
||||
for (int i1 = 0; i1 < cpy_ne_D; i1 += 2) {
|
||||
tmp_h2[i1/2] = make_half2(tmp_f[i1 + 0], tmp_f[i1 + 1]);
|
||||
}
|
||||
ggml_cuda_memcpy_1<sizeof(tmp_h2)>(&Q_tmp[j][i0/2 + threadIdx.x*(cpy_ne_D/2)], tmp_h2);
|
||||
#else
|
||||
Q_tmp[j][2*i0 + threadIdx.x] = tmp.x * scale;
|
||||
Q_tmp[j][2*i0 + warp_size + threadIdx.x] = tmp.y * scale;
|
||||
ggml_cuda_memcpy_1<sizeof(tmp_f)> (&Q_tmp[j][i0 + threadIdx.x* cpy_ne_D], tmp_f);
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Main loop over KV cache:
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
for (int k_VKQ_0 = blockIdx.y*kq_stride; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*kq_stride) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
float kqmax_new[ncols/nwarps];
|
||||
float KQ_max_new[cpw];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/nwarps; ++j) {
|
||||
kqmax_new[j] = kqmax[j];
|
||||
for (int j = 0; j < cpw; ++j) {
|
||||
KQ_max_new[j] = KQ_max[j];
|
||||
}
|
||||
|
||||
float sum[kq_stride/warp_size][ncols/nwarps] = {{0.0f}};
|
||||
float KQ_acc[kq_stride/warp_size][cpw] = {{0.0f}}; // Accumulators for KQ matrix multiplication.
|
||||
|
||||
// KQ = K @ Q matrix multiplication:
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += kq_nbatch) {
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size) {
|
||||
const half2 tmp_h2 = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x];
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x] = tmp_h2;
|
||||
#else
|
||||
const float2 tmp_f2 = __half22float2(tmp_h2);
|
||||
KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + threadIdx.x] = tmp_f2.x;
|
||||
KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + warp_size + threadIdx.x] = tmp_f2.y;
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
constexpr int cpy_ne_kqnb = cpy_ne < kq_nbatch/(2*warp_size) ? cpy_ne : kq_nbatch/(2*warp_size);
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size*cpy_ne_kqnb) {
|
||||
ggml_cuda_memcpy_1<cpy_ne_kqnb*4>(
|
||||
&KV_tmp[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x*cpy_ne_kqnb],
|
||||
&K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x*cpy_ne_kqnb]);
|
||||
}
|
||||
#else
|
||||
constexpr int cpy_ne_kqnb = cpy_ne < kq_nbatch/warp_size ? cpy_ne : kq_nbatch/warp_size;
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += warp_size*cpy_ne_kqnb) {
|
||||
half2 tmp_h2[cpy_ne_kqnb/2];
|
||||
ggml_cuda_memcpy_1<sizeof(tmp_h2)>(
|
||||
tmp_h2, &K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1/2 + threadIdx.x*(cpy_ne_kqnb/2)]);
|
||||
|
||||
float2 tmp_f2[cpy_ne_kqnb/2];
|
||||
#pragma unroll
|
||||
for (int k_KQ_2 = 0; k_KQ_2 < cpy_ne_kqnb/2; ++k_KQ_2) {
|
||||
tmp_f2[k_KQ_2] = __half22float2(tmp_h2[k_KQ_2]);
|
||||
}
|
||||
ggml_cuda_memcpy_1<sizeof(tmp_f2)>(
|
||||
&KV_tmp[i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1 + threadIdx.x*cpy_ne_kqnb], tmp_f2);
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@@ -298,12 +353,12 @@ static __global__ void flash_attn_tile(
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += cpy_ne) {
|
||||
half2 K_k[kq_stride/warp_size][cpy_ne];
|
||||
half2 Q_k[ncols/nwarps][cpy_ne];
|
||||
half2 Q_k[cpw][cpy_ne];
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += cpy_ne) {
|
||||
float K_k[kq_stride/warp_size][cpy_ne];
|
||||
float Q_k[ncols/nwarps][cpy_ne];
|
||||
float Q_k[cpw][cpy_ne];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
#pragma unroll
|
||||
@@ -311,29 +366,29 @@ static __global__ void flash_attn_tile(
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]);
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]);
|
||||
#else
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp_f [i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1]);
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp[i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1]);
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y*cpw;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]);
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]);
|
||||
#else
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0 + k_KQ_1]);
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0], &Q_tmp[j_KQ][k_KQ_0 + k_KQ_1]);
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) {
|
||||
#pragma unroll
|
||||
for (int k = 0; k < cpy_ne; ++k) {
|
||||
ggml_cuda_mad(sum[i_KQ_0/warp_size][j_KQ_0/nwarps], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0/nwarps][k]);
|
||||
ggml_cuda_mad(KQ_acc[i_KQ_0/warp_size][j_KQ_0], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0][k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -344,104 +399,77 @@ static __global__ void flash_attn_tile(
|
||||
}
|
||||
}
|
||||
|
||||
// Apply logit softcap, mask, update KQ_max:
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y*cpw;
|
||||
|
||||
if (use_logit_softcap) {
|
||||
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/warp_size][j_KQ_0/nwarps]);
|
||||
KQ_acc[i_KQ_0/warp_size][j_KQ_0] = logit_softcap * tanhf(KQ_acc[i_KQ_0/warp_size][j_KQ_0]);
|
||||
}
|
||||
|
||||
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
|
||||
KQ_acc[i_KQ_0/warp_size][j_KQ_0] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
|
||||
|
||||
kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/warp_size][j_KQ_0/nwarps]);
|
||||
|
||||
KQ[j_KQ][i_KQ] = sum[i_KQ_0/warp_size][j_KQ_0/nwarps];
|
||||
KQ_max_new[j_KQ_0] = fmaxf(KQ_max_new[j_KQ_0], KQ_acc[i_KQ_0/warp_size][j_KQ_0]);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Calculate KQ softmax, write to shared KQ buffer, re-scale VKQ accumulators:
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
kqmax_new[j0/nwarps] = warp_reduce_max<warp_size>(kqmax_new[j0/nwarps]);
|
||||
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
|
||||
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
|
||||
|
||||
float kqsum_add = 0.0f;
|
||||
if (kq_stride % (4*warp_size) == 0 && cpy_ne % 4 == 0) {
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < kq_stride; i0 += 4*warp_size) {
|
||||
const int i = i0 + 4*threadIdx.x;
|
||||
|
||||
float4 val = *(const float4 *) &KQ[j][i];
|
||||
val.x = expf(val.x - kqmax[j0/nwarps]);
|
||||
val.y = expf(val.y - kqmax[j0/nwarps]);
|
||||
val.z = expf(val.z - kqmax[j0/nwarps]);
|
||||
val.w = expf(val.w - kqmax[j0/nwarps]);
|
||||
kqsum_add += val.x + val.y + val.z + val.w;
|
||||
|
||||
for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 tmp[2] = {make_half2(val.x, val.y), make_half2(val.z, val.w)};
|
||||
ggml_cuda_memcpy_1<sizeof(tmp)>(&KQ[j][i/2], &tmp);
|
||||
half tmp[kq_stride/warp_size][softmax_iter_j];
|
||||
#else
|
||||
ggml_cuda_memcpy_1<sizeof(val)>(&KQ[j][i], &val);
|
||||
float tmp[kq_stride/warp_size][softmax_iter_j];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
} else if (kq_stride % (2*warp_size) == 0 && cpy_ne % 2 == 0) {
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < kq_stride; i0 += 2*warp_size) {
|
||||
const int i = i0 + 2*threadIdx.x;
|
||||
|
||||
float2 val = *(const float2 *) &KQ[j][i];
|
||||
val.x = expf(val.x - kqmax[j0/nwarps]);
|
||||
val.y = expf(val.y - kqmax[j0/nwarps]);
|
||||
kqsum_add += val.x + val.y;
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 tmp = make_half2(val.x, val.y);
|
||||
ggml_cuda_memcpy_1<sizeof(tmp)>(&KQ[j][i/2], &tmp);
|
||||
#else
|
||||
ggml_cuda_memcpy_1<sizeof(val)>(&KQ[j][i], &val);
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int j1 = 0; j1 < softmax_iter_j; ++j1) {
|
||||
KQ_max_new[j0+j1] = warp_reduce_max<warp_size>(KQ_max_new[j0+j1]);
|
||||
const float KQ_max_scale = expf(KQ_max[j0+j1] - KQ_max_new[j0+j1]);
|
||||
KQ_max[j0+j1] = KQ_max_new[j0+j1];
|
||||
|
||||
float KQ_sum_add = 0.0f;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < kq_stride; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float diff = KQ[j][i] - kqmax[j0/nwarps];
|
||||
const float val = expf(diff);
|
||||
kqsum_add += val;
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
((half *) KQ[j])[i] = val;
|
||||
#else
|
||||
KQ[j][i] = val;
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
const float val = expf(KQ_acc[i0/warp_size][j0+j1] - KQ_max[j0+j1]);
|
||||
KQ_sum_add += val;
|
||||
tmp[i0/warp_size][j1] = val;
|
||||
}
|
||||
}
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
|
||||
KQ_sum[j0+j1] = KQ_sum[j0+j1]*KQ_max_scale + KQ_sum_add;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2;
|
||||
}
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0+j1][i0/warp_size] *= KQ_max_scale_h2;
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale;
|
||||
VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale;
|
||||
}
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0+j1][i0/warp_size].x *= KQ_max_scale;
|
||||
VKQ[j0+j1][i0/warp_size].y *= KQ_max_scale;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < kq_stride; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
ggml_cuda_memcpy_1<sizeof(tmp[0])>(
|
||||
KQ[j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j)][i], tmp[i0/warp_size]);
|
||||
}
|
||||
}
|
||||
|
||||
constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D;
|
||||
// VKQ = V @ KQ matrix multiplication:
|
||||
constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D; // Number of V columns that fit in SRAM for K.
|
||||
static_assert(kq_stride % V_cols_per_iter == 0, "bad V_cols_per_iter");
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < kq_stride; k0 += V_cols_per_iter) {
|
||||
@@ -449,65 +477,96 @@ static __global__ void flash_attn_tile(
|
||||
for (int k1 = 0; k1 < V_cols_per_iter; k1 += nwarps) {
|
||||
const int k_tile = k1 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const half2 tmp = V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i];
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
KV_tmp_h2[k_tile*(D/2) + i] = tmp;
|
||||
#else
|
||||
KV_tmp_f2[k_tile*(D/2) + i] = __half22float2(tmp);
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
constexpr int cpy_ne_D = cpy_ne < D/(2*warp_size) ? cpy_ne : D/(2*warp_size);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) {
|
||||
ggml_cuda_memcpy_1<cpy_ne_D*4>(
|
||||
&KV_tmp[k_tile*(D/2) + i0 + threadIdx.x*cpy_ne_D],
|
||||
&V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i0 + threadIdx.x*cpy_ne_D]);
|
||||
}
|
||||
#else
|
||||
constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) {
|
||||
half2 tmp_h2[cpy_ne_D/2];
|
||||
ggml_cuda_memcpy_1<sizeof(tmp_h2)>(
|
||||
tmp_h2, &V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i0/2 + threadIdx.x*(cpy_ne_D/2)]);
|
||||
|
||||
float2 tmp_f2[cpy_ne_D/2];
|
||||
#pragma unroll
|
||||
for (int i1 = 0; i1 < cpy_ne_D/2; ++i1) {
|
||||
tmp_f2[i1] = __half22float2(tmp_h2[i1]);
|
||||
}
|
||||
ggml_cuda_memcpy_1<sizeof(tmp_f2)>(
|
||||
&KV_tmp[k_tile*D + i0 + threadIdx.x*cpy_ne_D], tmp_f2);
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
#pragma unroll
|
||||
for (int k1 = 0; k1 < V_cols_per_iter; ++k1) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
half2 V_k[(D/2)/warp_size];
|
||||
half2 KQ_k[ncols/nwarps];
|
||||
#else
|
||||
float2 V_k[(D/2)/warp_size];
|
||||
float KQ_k[ncols/nwarps];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
half2 KQ_k[cpw];
|
||||
|
||||
constexpr int cpy_ne_D = cpy_ne/2 < (D/2)/warp_size ? cpy_ne/2 : (D/2)/warp_size;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
V_k[i0/warp_size] = KV_tmp_h2[k1*(D/2) + i];
|
||||
#else
|
||||
V_k[i0/warp_size] = KV_tmp_f2[k1*(D/2) + i];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) {
|
||||
ggml_cuda_memcpy_1<cpy_ne_D*4>(&V_k[i0/warp_size], &KV_tmp[k1*(D/2) + i0 + threadIdx.x*cpy_ne_D]);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) {
|
||||
const int j = j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j);
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
KQ_k[j0/nwarps] = __half2half2(((const half *)KQ[j])[k0 + k1]);
|
||||
#else
|
||||
KQ_k[j0/nwarps] = KQ[j][k0 + k1];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
half tmp[softmax_iter_j];
|
||||
ggml_cuda_memcpy_1<softmax_iter_j*sizeof(half)>(
|
||||
&tmp, KQ[j][k0 + k1]);
|
||||
#pragma unroll
|
||||
for (int j1 = 0; j1 < softmax_iter_j; ++j1) {
|
||||
KQ_k[j0+j1] = __half2half2(tmp[j1]);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
VKQ[j0/nwarps][i0/warp_size] += V_k[i0/warp_size] *KQ_k[j0/nwarps];
|
||||
#else
|
||||
VKQ[j0/nwarps][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0/nwarps];
|
||||
VKQ[j0/nwarps][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0/nwarps];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
for (int j0 = 0; j0 < cpw; ++j0) {
|
||||
VKQ[j0][i0/warp_size] += V_k[i0/warp_size]*KQ_k[j0];
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int k1 = 0; k1 < V_cols_per_iter; ++k1) {
|
||||
float2 V_k[(D/2)/warp_size];
|
||||
float KQ_k[cpw];
|
||||
|
||||
constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) {
|
||||
ggml_cuda_memcpy_1<cpy_ne_D*4>(&V_k[i0/(2*warp_size)], &KV_tmp[k1*D + i0 + threadIdx.x*cpy_ne_D]);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) {
|
||||
const int j = j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j);
|
||||
|
||||
ggml_cuda_memcpy_1<softmax_iter_j*sizeof(float)>(
|
||||
&KQ_k[j0], KQ[j][k0 + k1]);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < cpw; ++j0) {
|
||||
VKQ[j0][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0];
|
||||
VKQ[j0][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0];
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
@@ -519,69 +578,92 @@ static __global__ void flash_attn_tile(
|
||||
const float sink = sinksf[head];
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
|
||||
kqmax_new_j = warp_reduce_max<warp_size>(kqmax_new_j);
|
||||
for (int j0 = 0; j0 < cpw; ++j0) {
|
||||
float KQ_max_new_j = fmaxf(KQ_max[j0], sink);
|
||||
KQ_max_new_j = warp_reduce_max<warp_size>(KQ_max_new_j);
|
||||
|
||||
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j);
|
||||
kqmax[j0/nwarps] = kqmax_new_j;
|
||||
const float KQ_max_scale = expf(KQ_max[j0] - KQ_max_new_j);
|
||||
KQ_max[j0] = KQ_max_new_j;
|
||||
|
||||
const float val = expf(sink - kqmax[j0/nwarps]);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
|
||||
const float val = expf(sink - KQ_max[j0]);
|
||||
KQ_sum[j0] = KQ_sum[j0] * KQ_max_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum[j0/nwarps] += val;
|
||||
KQ_sum[j0] += val;
|
||||
}
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2;
|
||||
VKQ[j0][i0/warp_size] *= KQ_max_scale_h2;
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale;
|
||||
VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale;
|
||||
VKQ[j0][i0/warp_size].x *= KQ_max_scale;
|
||||
VKQ[j0][i0/warp_size].y *= KQ_max_scale;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
float2 * dst2 = (float2 *) dst;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) {
|
||||
KQ_sum[j_VKQ_0] = warp_reduce_sum<warp_size>(KQ_sum[j_VKQ_0]);
|
||||
}
|
||||
if (gridDim.y == 1) {
|
||||
#pragma unroll
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 KQ_sum_j_inv = make_half2(1.0f/KQ_sum[j_VKQ_0], 1.0f/KQ_sum[j_VKQ_0]);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < (D/2)/warp_size; ++i) {
|
||||
VKQ[j_VKQ_0][i] *= KQ_sum_j_inv;
|
||||
}
|
||||
#else
|
||||
const float KQ_sum_j_inv = 1.0f/KQ_sum[j_VKQ_0];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < (D/2)/warp_size; ++i) {
|
||||
VKQ[j_VKQ_0][i].x *= KQ_sum_j_inv;
|
||||
VKQ[j_VKQ_0][i].y *= KQ_sum_j_inv;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
// Write back results:
|
||||
#pragma unroll
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) {
|
||||
const int j_VKQ = j_VKQ_0 + threadIdx.y*cpw;
|
||||
|
||||
if (ic0 + j_VKQ >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
float kqsum_j = kqsum[j_VKQ_0/nwarps];
|
||||
kqsum_j = warp_reduce_sum<warp_size>(kqsum_j);
|
||||
|
||||
const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i00 = 0; i00 < D/2; i00 += warp_size) {
|
||||
const int i0 = i00 + threadIdx.x;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
float2 dst_val = __half22float2(VKQ[j_VKQ_0/nwarps][i0/warp_size]);
|
||||
constexpr int cpy_ne_D = cpy_ne/2 < (D/2)/warp_size ? cpy_ne/2 : (D/2)/warp_size;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) {
|
||||
float2 tmp[cpy_ne_D];
|
||||
#pragma unroll
|
||||
for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
|
||||
tmp[i1] = __half22float2(VKQ[j_VKQ_0][i0/warp_size + i1]);
|
||||
}
|
||||
ggml_cuda_memcpy_1<sizeof(tmp)>(&dst[j_dst_unrolled*D + 2*i0 + threadIdx.x*(2*cpy_ne_D)], tmp);
|
||||
}
|
||||
#else
|
||||
float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/warp_size];
|
||||
constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) {
|
||||
ggml_cuda_memcpy_1<cpy_ne_D*4>(
|
||||
&dst[j_dst_unrolled*D + i0 + threadIdx.x*cpy_ne_D], &VKQ[j_VKQ_0][i0/(2*warp_size)]);
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
if (gridDim.y == 1) {
|
||||
dst_val.x /= kqsum_j;
|
||||
dst_val.y /= kqsum_j;
|
||||
}
|
||||
dst2[j_dst_unrolled*(D/2) + i0] = dst_val;
|
||||
}
|
||||
|
||||
if (gridDim.y != 1 && threadIdx.x == 0) {
|
||||
dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
|
||||
dst_meta[j_dst_unrolled] = make_float2(KQ_max[j_VKQ_0], KQ_sum[j_VKQ_0]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
@@ -602,15 +684,29 @@ template <int D, bool use_logit_softcap>
|
||||
static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const int warp_size = 32;
|
||||
const int nwarps = FATTN_TILE_NTHREADS / warp_size;
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const int warp_size = 32;
|
||||
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
|
||||
#ifdef GGML_USE_HIP
|
||||
if constexpr (D <= 128) {
|
||||
if (Q->ne[1] > 32) {
|
||||
constexpr int cols_per_block = 64;
|
||||
const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile<D, cols_per_block, use_logit_softcap>;
|
||||
const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size);
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, kq_stride, true, true, false, warp_size);
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif // GGML_USE_HIP
|
||||
|
||||
if (Q->ne[1] > 16) {
|
||||
constexpr int cols_per_block = 32;
|
||||
const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile<D, cols_per_block, use_logit_softcap>;
|
||||
const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size);
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
@@ -619,6 +715,7 @@ static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 16;
|
||||
const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile<D, cols_per_block, use_logit_softcap>;
|
||||
const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size);
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
|
||||
@@ -122,11 +122,14 @@ static __global__ void im2col_3d_kernel(
|
||||
int64_t OH_OW, int64_t KD_KH_KW, int64_t ID_IH_IW, int64_t KH_KW, int64_t IH_IW, int64_t IC_ID_IH_IW,
|
||||
int64_t IC_KD_KH_KW, int64_t OW_KD_KH_KW, int64_t OD_OH_OW_IC_KD_KH_KW, int64_t OH_OW_IC_KD_KH_KW,
|
||||
int64_t OW_IC_KD_KH_KW, int64_t N_OD_OH, int64_t OD_OH,
|
||||
int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x,
|
||||
int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2) {
|
||||
const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (i >= IC_KD_KH_KW) {
|
||||
return;
|
||||
}
|
||||
GGML_UNUSED(N); GGML_UNUSED(OC); GGML_UNUSED(OH_OW); GGML_UNUSED(OD); GGML_UNUSED(OW); GGML_UNUSED(KD); GGML_UNUSED(KH);
|
||||
GGML_UNUSED(ID_IH_IW); GGML_UNUSED(IH_IW); GGML_UNUSED(IC_ID_IH_IW); GGML_UNUSED(OW_KD_KH_KW);
|
||||
|
||||
const int64_t iic = i / KD_KH_KW;
|
||||
const int64_t ikd = (i - iic * KD_KH_KW) / KH_KW;
|
||||
@@ -148,7 +151,7 @@ static __global__ void im2col_3d_kernel(
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
const int64_t offset_src = in*IC_ID_IH_IW + iic*ID_IH_IW + iid*IH_IW + iih*IW + iiw;
|
||||
const int64_t offset_src = ((in * IC + iic) * stride_q) + (iid * stride_z) + (iih * stride_y) + (iiw * stride_x);
|
||||
dst[offset_dst] = src[offset_src];
|
||||
}
|
||||
}
|
||||
@@ -159,6 +162,7 @@ template <typename T>
|
||||
static void im2col_3d_cuda(const float * src, T* dst,
|
||||
int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
|
||||
int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
|
||||
int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x,
|
||||
int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) {
|
||||
const int64_t OH_OW = OH*OW;
|
||||
const int64_t KD_KH_KW = KD*KH*KW;
|
||||
@@ -179,23 +183,30 @@ static void im2col_3d_cuda(const float * src, T* dst,
|
||||
OH_OW, KD_KH_KW, ID_IH_IW, KH_KW, IH_IW, IC_ID_IH_IW,
|
||||
IC_KD_KH_KW, OW_KD_KH_KW, OD_OH_OW_IC_KD_KH_KW,
|
||||
OH_OW_IC_KD_KH_KW, OW_IC_KD_KH_KW, N_OD_OH, OD_OH,
|
||||
stride_q, stride_z, stride_y, stride_x,
|
||||
s0, s1, s2, p0, p1, p2, d0, d1, d2);
|
||||
}
|
||||
|
||||
static void im2col_3d_cuda_f16(const float * src, half * dst,
|
||||
int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
|
||||
int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
|
||||
int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x,
|
||||
int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) {
|
||||
|
||||
im2col_3d_cuda<half>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
|
||||
im2col_3d_cuda<half>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
|
||||
stride_q, stride_z, stride_y, stride_x,
|
||||
s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
|
||||
}
|
||||
|
||||
static void im2col_3d_cuda_f32(const float * src, float * dst,
|
||||
int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
|
||||
int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
|
||||
int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x,
|
||||
int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) {
|
||||
|
||||
im2col_3d_cuda<float>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
|
||||
im2col_3d_cuda<float>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
|
||||
stride_q, stride_z, stride_y, stride_x,
|
||||
s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
@@ -235,9 +246,19 @@ void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
|
||||
const int64_t OH = ne2;
|
||||
const int64_t OW = ne1;
|
||||
|
||||
const size_t es = ggml_element_size(src1);
|
||||
const int64_t stride_x = src1->nb[0] / es;
|
||||
const int64_t stride_y = src1->nb[1] / es;
|
||||
const int64_t stride_z = src1->nb[2] / es;
|
||||
const int64_t stride_q = src1->nb[3] / es;
|
||||
|
||||
if(dst->type == GGML_TYPE_F16) {
|
||||
im2col_3d_cuda_f16(src1_d, (half *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
|
||||
im2col_3d_cuda_f16(src1_d, (half *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
|
||||
stride_q, stride_z, stride_y, stride_x,
|
||||
s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
|
||||
} else {
|
||||
im2col_3d_cuda_f32(src1_d, (float *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
|
||||
im2col_3d_cuda_f32(src1_d, (float *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
|
||||
stride_q, stride_z, stride_y, stride_x,
|
||||
s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
|
||||
}
|
||||
}
|
||||
|
||||
+23
-35
@@ -57,31 +57,33 @@ static __global__ void mul_mat_f(
|
||||
T * tile_xy = (T *) compute_base + threadIdx.y*(tile_A::I * tile_k_padded);
|
||||
|
||||
if constexpr (has_ids) {
|
||||
__shared__ int has_any;
|
||||
if (threadIdx.y == 0) {
|
||||
int local_has_any = 0;
|
||||
for (int j = threadIdx.x; j < cols_per_block; j += warp_size) {
|
||||
int slot = -1;
|
||||
for (int k = 0; k < nchannels_dst; ++k) {
|
||||
const int idv = ids[j*stride_row_id + k*stride_col_id];
|
||||
if (idv == expert_idx) {
|
||||
slot = k;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j < cols_per_block) {
|
||||
local_has_any |= (slot >= 0);
|
||||
slot_map[j] = slot;
|
||||
int found = 0;
|
||||
|
||||
for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
const int32_t * __restrict__ id_row = ids + j*stride_row_id;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
slot_map[j] = -1;
|
||||
}
|
||||
|
||||
for (int k = threadIdx.x; k < nchannels_dst; k += warp_size) {
|
||||
int match = id_row[k*stride_col_id] == expert_idx;
|
||||
|
||||
if (match) {
|
||||
slot_map[j] = k;
|
||||
found = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
has_any = warp_reduce_any(local_has_any);
|
||||
}
|
||||
__syncthreads();
|
||||
if (has_any == 0) {
|
||||
|
||||
if (!__syncthreads_or(found)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) {
|
||||
tile_A A[ntA][warp_size / tile_A::J];
|
||||
#pragma unroll
|
||||
@@ -106,14 +108,7 @@ static __global__ void mul_mat_f(
|
||||
if constexpr (!has_ids) {
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f;
|
||||
} else {
|
||||
float val = 0.0f;
|
||||
if (j < cols_per_block) {
|
||||
const int slot = slot_map[j];
|
||||
if (slot >= 0) {
|
||||
val = y[slot*stride_channel_y + j*stride_col_y + col];
|
||||
}
|
||||
}
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = val;
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[slot_map[j]*stride_channel_y + j*stride_col_y + col] : 0.0f;
|
||||
}
|
||||
}
|
||||
} else if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
|
||||
@@ -125,14 +120,7 @@ static __global__ void mul_mat_f(
|
||||
const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
|
||||
} else {
|
||||
float2 tmp = make_float2(0.0f, 0.0f);
|
||||
if (j < cols_per_block) {
|
||||
const int slot = slot_map[j];
|
||||
if (slot >= 0) {
|
||||
const float2 * y2_slot = (const float2 *)(y + slot*stride_channel_y);
|
||||
tmp = y2_slot[j*stride_col_y + col];
|
||||
}
|
||||
}
|
||||
float2 tmp = j < cols_per_block && slot_map[j] >= 0 ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
|
||||
}
|
||||
}
|
||||
@@ -221,7 +209,7 @@ static inline void mul_mat_f_switch_ids(
|
||||
const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared_total, cudaStream_t stream) {
|
||||
if (ids) {
|
||||
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, true><<<block_nums, block_dims, nbytes_shared_total, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
(x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} else {
|
||||
|
||||
@@ -7,11 +7,11 @@ static __global__ void timestep_embedding_f32(const float * timesteps, float * d
|
||||
int j = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
float * embed_data = (float *)((char *)dst + i*nb1);
|
||||
|
||||
if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
|
||||
embed_data[dim] = 0.f;
|
||||
int half = dim / 2;
|
||||
if (dim % 2 != 0 && j == half) {
|
||||
embed_data[2 * half] = 0.f;
|
||||
}
|
||||
|
||||
int half = dim / 2;
|
||||
if (j >= half) {
|
||||
return;
|
||||
}
|
||||
|
||||
Vendored
+19
-15
@@ -158,41 +158,41 @@
|
||||
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
|
||||
#define GCN
|
||||
#endif
|
||||
|
||||
#if defined(__gfx900__) || defined(__gfx906__)
|
||||
#define GCN5
|
||||
#endif
|
||||
#endif // defined(__gfx900__) || defined(__gfx906__)
|
||||
|
||||
#if defined(__gfx803__)
|
||||
#define GCN4
|
||||
#endif
|
||||
#endif // defined(__gfx803__)
|
||||
|
||||
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
|
||||
#define CDNA // For the entire family
|
||||
#endif
|
||||
#if defined(GCN5) || defined(GCN4)
|
||||
#define GCN
|
||||
#endif // defined(GCN5) || defined(GCN4)
|
||||
|
||||
#if defined(__gfx942__)
|
||||
#define CDNA3
|
||||
#endif
|
||||
#endif // defined(__gfx942__)
|
||||
|
||||
#if defined(__gfx90a__)
|
||||
#define CDNA2
|
||||
#endif
|
||||
#endif // defined(__gfx90a__)
|
||||
|
||||
#if defined(__gfx908__)
|
||||
#define CDNA1
|
||||
#endif
|
||||
#endif // defined(__gfx908__)
|
||||
|
||||
#if defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
|
||||
#define CDNA // For the entire family
|
||||
#endif // defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
|
||||
|
||||
#if defined(__GFX12__)
|
||||
#define RDNA4
|
||||
#endif
|
||||
#endif // defined(__GFX12__)
|
||||
|
||||
#if defined(__GFX11__)
|
||||
#define RDNA3
|
||||
#endif
|
||||
#endif // defined(__GFX11__)
|
||||
|
||||
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
|
||||
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
|
||||
@@ -201,7 +201,11 @@
|
||||
|
||||
#if defined(__gfx1010__) || defined(__gfx1012__)
|
||||
#define RDNA1
|
||||
#endif
|
||||
#endif // defined(__gfx1010__) || defined(__gfx1012__)
|
||||
|
||||
#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(RDNA1)
|
||||
#define RDNA // For the entire family
|
||||
#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(RDNA1)
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
|
||||
@@ -5,7 +5,12 @@ find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
message(STATUS "Metal framework found")
|
||||
|
||||
ggml_add_backend_library(ggml-metal
|
||||
ggml-metal.m
|
||||
ggml-metal.cpp
|
||||
ggml-metal-device.m
|
||||
ggml-metal-device.cpp
|
||||
ggml-metal-common.cpp
|
||||
ggml-metal-context.m
|
||||
ggml-metal-ops.cpp
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-metal PRIVATE
|
||||
@@ -18,10 +23,6 @@ if (GGML_METAL_NDEBUG)
|
||||
add_compile_definitions(GGML_METAL_NDEBUG)
|
||||
endif()
|
||||
|
||||
if (GGML_METAL_USE_BF16)
|
||||
add_compile_definitions(GGML_METAL_USE_BF16)
|
||||
endif()
|
||||
|
||||
# copy metal files to bin directory
|
||||
configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
|
||||
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
||||
|
||||
@@ -0,0 +1,458 @@
|
||||
#include "ggml-metal-common.h"
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
// represents a memory range (i.e. an interval from a starting address p0 to an ending address p1 in a given buffer pb)
|
||||
// the type indicates whether it is a source range (i.e. ops read data from it) or a destination range (i.e. ops write data to it)
|
||||
struct ggml_mem_range {
|
||||
uint64_t pb; // buffer id
|
||||
|
||||
uint64_t p0; // begin
|
||||
uint64_t p1; // end
|
||||
|
||||
ggml_mem_range_type pt;
|
||||
};
|
||||
|
||||
struct ggml_mem_ranges {
|
||||
std::vector<ggml_mem_range> ranges;
|
||||
|
||||
int debug = 0;
|
||||
};
|
||||
|
||||
ggml_mem_ranges_t ggml_mem_ranges_init(int debug) {
|
||||
auto * res = new ggml_mem_ranges;
|
||||
|
||||
res->ranges.reserve(256);
|
||||
res->debug = debug;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void ggml_mem_ranges_free(ggml_mem_ranges_t mrs) {
|
||||
delete mrs;
|
||||
}
|
||||
|
||||
void ggml_mem_ranges_reset(ggml_mem_ranges_t mrs) {
|
||||
mrs->ranges.clear();
|
||||
}
|
||||
|
||||
static bool ggml_mem_ranges_add(ggml_mem_ranges_t mrs, ggml_mem_range mr) {
|
||||
mrs->ranges.push_back(mr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static ggml_mem_range ggml_mem_range_from_tensor(const ggml_tensor * tensor, ggml_mem_range_type pt) {
|
||||
// always use the base tensor
|
||||
tensor = tensor->view_src ? tensor->view_src : tensor;
|
||||
|
||||
GGML_ASSERT(!tensor->view_src);
|
||||
|
||||
ggml_mem_range mr;
|
||||
|
||||
if (tensor->buffer) {
|
||||
// when the tensor is allocated, use the actual memory address range in the buffer
|
||||
//
|
||||
// take the actual allocated size with ggml_backend_buft_get_alloc_size()
|
||||
// this can be larger than the tensor size if the buffer type allocates extra memory
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/15966
|
||||
mr = {
|
||||
/*.pb =*/ (uint64_t) tensor->buffer,
|
||||
/*.p0 =*/ (uint64_t) tensor->data,
|
||||
/*.p1 =*/ (uint64_t) tensor->data + ggml_backend_buft_get_alloc_size(tensor->buffer->buft, tensor),
|
||||
/*.pt =*/ pt,
|
||||
};
|
||||
} else {
|
||||
// otherwise, the pointer address is used as an unique id of the memory ranges
|
||||
// that the tensor will be using when it is allocated
|
||||
mr = {
|
||||
/*.pb =*/ (uint64_t) tensor,
|
||||
/*.p0 =*/ 0, //
|
||||
/*.p1 =*/ 1024, // [0, 1024) is a dummy range, not used
|
||||
/*.pt =*/ pt,
|
||||
};
|
||||
};
|
||||
|
||||
return mr;
|
||||
}
|
||||
|
||||
static ggml_mem_range ggml_mem_range_from_tensor_src(const ggml_tensor * tensor) {
|
||||
return ggml_mem_range_from_tensor(tensor, MEM_RANGE_TYPE_SRC);
|
||||
}
|
||||
|
||||
static ggml_mem_range ggml_mem_range_from_tensor_dst(const ggml_tensor * tensor) {
|
||||
return ggml_mem_range_from_tensor(tensor, MEM_RANGE_TYPE_DST);
|
||||
}
|
||||
|
||||
static bool ggml_mem_ranges_add_src(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor);
|
||||
|
||||
ggml_mem_range mr = ggml_mem_range_from_tensor_src(tensor);
|
||||
|
||||
if (mrs->debug > 2) {
|
||||
GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1);
|
||||
}
|
||||
|
||||
return ggml_mem_ranges_add(mrs, mr);
|
||||
}
|
||||
|
||||
static bool ggml_mem_ranges_add_dst(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor);
|
||||
|
||||
ggml_mem_range mr = ggml_mem_range_from_tensor_dst(tensor);
|
||||
|
||||
if (mrs->debug > 2) {
|
||||
GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1);
|
||||
}
|
||||
|
||||
return ggml_mem_ranges_add(mrs, mr);
|
||||
}
|
||||
|
||||
bool ggml_mem_ranges_add(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (tensor->src[i]) {
|
||||
ggml_mem_ranges_add_src(mrs, tensor->src[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return ggml_mem_ranges_add_dst(mrs, tensor);
|
||||
}
|
||||
|
||||
static bool ggml_mem_ranges_check(ggml_mem_ranges_t mrs, ggml_mem_range mr) {
|
||||
for (size_t i = 0; i < mrs->ranges.size(); i++) {
|
||||
const auto & cmp = mrs->ranges[i];
|
||||
|
||||
// two memory ranges cannot intersect if they are in different buffers
|
||||
if (mr.pb != cmp.pb) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// intersecting source ranges are allowed
|
||||
if (mr.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (mr.p0 < cmp.p1 && mr.p1 >= cmp.p0) {
|
||||
if (mrs->debug > 2) {
|
||||
GGML_LOG_DEBUG("%s: the %s range buf=%lld, [%lld, %lld) overlaps with a previous %s range buf=%lld, [%lld, %lld)\n",
|
||||
__func__,
|
||||
mr.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
|
||||
mr.pb, mr.p0, mr.p1,
|
||||
cmp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
|
||||
cmp.pb, cmp.p0, cmp.p1);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_mem_ranges_check_src(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor);
|
||||
|
||||
ggml_mem_range mr = ggml_mem_range_from_tensor_src(tensor);
|
||||
|
||||
const bool res = ggml_mem_ranges_check(mrs, mr);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static bool ggml_mem_ranges_check_dst(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor);
|
||||
|
||||
ggml_mem_range mr = ggml_mem_range_from_tensor_dst(tensor);
|
||||
|
||||
const bool res = ggml_mem_ranges_check(mrs, mr);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
bool ggml_mem_ranges_check(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (tensor->src[i]) {
|
||||
if (!ggml_mem_ranges_check_src(mrs, tensor->src[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ggml_mem_ranges_check_dst(mrs, tensor);
|
||||
}
|
||||
|
||||
// TODO: move to ggml.h?
|
||||
static bool is_empty(ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
struct node_info {
|
||||
ggml_tensor * node;
|
||||
|
||||
std::vector<ggml_tensor *> fused;
|
||||
|
||||
ggml_op op() const {
|
||||
return node->op;
|
||||
}
|
||||
|
||||
const ggml_tensor * dst() const {
|
||||
return fused.empty() ? node : fused.back();
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return ::is_empty(node->op);
|
||||
}
|
||||
|
||||
void add_fused(ggml_tensor * t) {
|
||||
fused.push_back(t);
|
||||
}
|
||||
};
|
||||
|
||||
static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node_info> & nodes) {
|
||||
// helper to add node src and dst ranges
|
||||
const auto & h_add = [](ggml_mem_ranges_t mrs, const node_info & node) {
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (node.node->src[i]) {
|
||||
if (!ggml_mem_ranges_add_src(mrs, node.node->src[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// keep track of the sources of the fused nodes as well
|
||||
for (const auto * fused : node.fused) {
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (fused->src[i]) {
|
||||
if (!ggml_mem_ranges_add_src(mrs, fused->src[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ggml_mem_ranges_add_dst(mrs, node.dst());
|
||||
};
|
||||
|
||||
// helper to check if a node can run concurrently with the existing set of nodes
|
||||
const auto & h_check = [](ggml_mem_ranges_t mrs, const node_info & node) {
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (node.node->src[i]) {
|
||||
if (!ggml_mem_ranges_check_src(mrs, node.node->src[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto * fused : node.fused) {
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (fused->src[i]) {
|
||||
if (!ggml_mem_ranges_check_src(mrs, fused->src[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ggml_mem_ranges_check_dst(mrs, node.dst());
|
||||
};
|
||||
|
||||
// perform reorders only across these types of ops
|
||||
// can be expanded when needed
|
||||
// IMPORTANT: do not add ops such as GGML_OP_CPY or GGML_OP_SET_ROWS
|
||||
// the dependencies from such ops are not always represented in the graph
|
||||
const auto & h_safe = [](ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_GLU:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_GET_ROWS:
|
||||
return true;
|
||||
default:
|
||||
return is_empty(op);
|
||||
}
|
||||
};
|
||||
|
||||
const int n = nodes.size();
|
||||
|
||||
std::vector<int> res;
|
||||
res.reserve(n);
|
||||
|
||||
std::vector<bool> used(n, false);
|
||||
|
||||
// the memory ranges for the set of currently concurrent nodes
|
||||
ggml_mem_ranges_t mrs0 = ggml_mem_ranges_init(0);
|
||||
|
||||
// the memory ranges for the set of nodes that haven't been processed yet, when looking forward for a node to reorder
|
||||
ggml_mem_ranges_t mrs1 = ggml_mem_ranges_init(0);
|
||||
|
||||
for (int i0 = 0; i0 < n; i0++) {
|
||||
if (used[i0]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto & node0 = nodes[i0];
|
||||
|
||||
// the node is not concurrent with the existing concurrent set, so we have to "put a barrier" (i.e reset mrs0)
|
||||
// but before we do that, look forward for some other nodes that can be added to the concurrent set mrs0
|
||||
//
|
||||
// note: we can always add empty nodes to the concurrent set as they don't read nor write anything
|
||||
if (!node0.is_empty() && !h_check(mrs0, node0)) {
|
||||
// this will hold the set of memory ranges from the nodes that haven't been processed yet
|
||||
// if a node is not concurrent with this set, we cannot reorder it
|
||||
ggml_mem_ranges_reset(mrs1);
|
||||
|
||||
// initialize it with the current node
|
||||
h_add(mrs1, node0);
|
||||
|
||||
// that many nodes forward to search for a concurrent node
|
||||
constexpr int N_FORWARD = 8;
|
||||
|
||||
for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
|
||||
if (used[i1]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto & node1 = nodes[i1];
|
||||
|
||||
// disallow reordering of certain ops
|
||||
if (!h_safe(node1.op())) {
|
||||
break;
|
||||
}
|
||||
|
||||
const bool is_empty = node1.is_empty();
|
||||
|
||||
// to reorder a node and add it to the concurrent set, it has to be:
|
||||
// + empty or concurrent with all nodes in the existing concurrent set (mrs0)
|
||||
// + concurrent with all nodes prior to it that haven't been processed yet (mrs1)
|
||||
if ((is_empty || h_check(mrs0, node1)) && h_check(mrs1, node1)) {
|
||||
// add the node to the existing concurrent set (i.e. reorder it for early execution)
|
||||
h_add(mrs0, node1);
|
||||
res.push_back(i1);
|
||||
|
||||
// mark as used, so we skip re-processing it later
|
||||
used[i1] = true;
|
||||
} else {
|
||||
// expand the set of nodes that haven't been processed yet
|
||||
h_add(mrs1, node1);
|
||||
}
|
||||
}
|
||||
|
||||
// finalize the concurrent set and begin a new one
|
||||
ggml_mem_ranges_reset(mrs0);
|
||||
}
|
||||
|
||||
// expand the concurrent set with the current node
|
||||
{
|
||||
h_add(mrs0, node0);
|
||||
res.push_back(i0);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_mem_ranges_free(mrs0);
|
||||
ggml_mem_ranges_free(mrs1);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void ggml_graph_optimize(ggml_cgraph * gf) {
|
||||
constexpr int MAX_FUSE = 16;
|
||||
|
||||
const int n = gf->n_nodes;
|
||||
|
||||
enum ggml_op ops[MAX_FUSE];
|
||||
|
||||
std::vector<node_info> nodes;
|
||||
nodes.reserve(gf->n_nodes);
|
||||
|
||||
// fuse nodes:
|
||||
// we don't want to make reorders that break fusing, so we first pack all fusable tensors
|
||||
// and perform the reorder over the fused nodes. after the reorder is done, we unfuse
|
||||
for (int i = 0; i < n; i++) {
|
||||
node_info node = {
|
||||
/*.node =*/ gf->nodes[i],
|
||||
/*.fused =*/ {},
|
||||
};
|
||||
|
||||
// fuse only ops that start with these operations
|
||||
// can be expanded when needed
|
||||
if (node.op() == GGML_OP_ADD ||
|
||||
node.op() == GGML_OP_RMS_NORM) {
|
||||
ops[0] = node.op();
|
||||
|
||||
int f = i + 1;
|
||||
while (f < n && f < i + MAX_FUSE) {
|
||||
// conservatively allow fusing only these ops
|
||||
// can be expanded when needed
|
||||
if (gf->nodes[f]->op != GGML_OP_ADD &&
|
||||
gf->nodes[f]->op != GGML_OP_MUL &&
|
||||
gf->nodes[f]->op != GGML_OP_RMS_NORM) {
|
||||
break;
|
||||
}
|
||||
ops[f - i] = gf->nodes[f]->op;
|
||||
f++;
|
||||
}
|
||||
|
||||
f -= i;
|
||||
for (; f > 1; f--) {
|
||||
if (ggml_can_fuse(gf, i, ops, f)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// add the fused tensors into the node info so we can unfuse them later
|
||||
for (int k = 1; k < f; k++) {
|
||||
++i;
|
||||
|
||||
// the .dst() becomes the last fused tensor
|
||||
node.add_fused(gf->nodes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
nodes.push_back(std::move(node));
|
||||
}
|
||||
|
||||
#if 1
|
||||
// reorder to improve concurrency
|
||||
const auto order = ggml_metal_graph_optimize_reorder(nodes);
|
||||
#else
|
||||
std::vector<int> order(nodes.size());
|
||||
for (size_t i = 0; i < nodes.size(); i++) {
|
||||
order[i] = i;
|
||||
}
|
||||
#endif
|
||||
|
||||
// unfuse
|
||||
{
|
||||
int j = 0;
|
||||
for (const auto i : order) {
|
||||
const auto & node = nodes[i];
|
||||
|
||||
gf->nodes[j++] = node.node;
|
||||
|
||||
for (auto * fused : node.fused) {
|
||||
gf->nodes[j++] = fused;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
// helper functions for ggml-metal that are too difficult to implement in Objective-C
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct ggml_tensor;
|
||||
struct ggml_cgraph;
|
||||
|
||||
enum ggml_mem_range_type {
|
||||
MEM_RANGE_TYPE_SRC = 0,
|
||||
MEM_RANGE_TYPE_DST = 1,
|
||||
};
|
||||
|
||||
// a helper object that can be used for reordering operations to improve concurrency
|
||||
//
|
||||
// the fundamental idea is that a set of tasks (either ggml ops, or something else) can run concurrently if they
|
||||
// don't write to a memory that is being read by another task or written to by another task in the set
|
||||
//
|
||||
// with this structure, we can add tasks to the set, setting memory constraints. we can also check if a new task
|
||||
// can be added to the set without violating the constraints (i.e. if it can be executed concurrently with the
|
||||
// tasks already in the set)
|
||||
//
|
||||
typedef struct ggml_mem_ranges * ggml_mem_ranges_t;
|
||||
|
||||
ggml_mem_ranges_t ggml_mem_ranges_init(int debug);
|
||||
void ggml_mem_ranges_free(ggml_mem_ranges_t mrs);
|
||||
|
||||
// remove all ranges from the set
|
||||
void ggml_mem_ranges_reset(ggml_mem_ranges_t mrs);
|
||||
|
||||
// add src or dst ranges to track
|
||||
bool ggml_mem_ranges_add(ggml_mem_ranges_t mrs, const struct ggml_tensor * tensor);
|
||||
|
||||
// return false if:
|
||||
// - new src range overlaps with any existing dst range
|
||||
// - new dst range overlaps with any existing range (src or dst)
|
||||
bool ggml_mem_ranges_check(ggml_mem_ranges_t mrs, const struct ggml_tensor * tensor);
|
||||
|
||||
// reorder the nodes in the graph to improve concurrency, while respecting fusion
|
||||
//
|
||||
// note: this implementation is generic and not specific to metal
|
||||
// if it proves to work well, we can start using it for other backends in the future
|
||||
void ggml_graph_optimize(struct ggml_cgraph * gf);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,33 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-metal-device.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// backend context
|
||||
//
|
||||
|
||||
typedef struct ggml_metal * ggml_metal_t;
|
||||
|
||||
ggml_metal_t ggml_metal_init(ggml_metal_device_t dev);
|
||||
void ggml_metal_free(ggml_metal_t ctx);
|
||||
|
||||
void ggml_metal_synchronize(ggml_metal_t ctx);
|
||||
|
||||
void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
|
||||
enum ggml_status ggml_metal_graph_compute (ggml_metal_t ctx, struct ggml_cgraph * gf);
|
||||
void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf);
|
||||
|
||||
void ggml_metal_set_n_cb (ggml_metal_t ctx, int n_cb);
|
||||
void ggml_metal_set_abort_callback (ggml_metal_t ctx, ggml_abort_callback abort_callback, void * user_data);
|
||||
bool ggml_metal_supports_family (ggml_metal_t ctx, int family);
|
||||
void ggml_metal_capture_next_compute(ggml_metal_t ctx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,575 @@
|
||||
#import "ggml-metal-context.h"
|
||||
|
||||
#import "ggml-impl.h"
|
||||
#import "ggml-backend-impl.h"
|
||||
|
||||
#import "ggml-metal-impl.h"
|
||||
#import "ggml-metal-common.h"
|
||||
#import "ggml-metal-ops.h"
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
|
||||
#import <Metal/Metal.h>
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// max number of MTLCommandBuffer used to submit a graph for processing
|
||||
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
|
||||
|
||||
struct ggml_metal_command_buffer {
|
||||
id<MTLCommandBuffer> obj;
|
||||
};
|
||||
|
||||
struct ggml_metal {
|
||||
id<MTLDevice> device;
|
||||
id<MTLCommandQueue> queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND]
|
||||
|
||||
ggml_metal_device_t dev;
|
||||
ggml_metal_library_t lib;
|
||||
|
||||
dispatch_queue_t d_queue;
|
||||
|
||||
// additional, inference-time compiled pipelines
|
||||
ggml_metal_pipelines_t pipelines_ext;
|
||||
|
||||
bool use_bfloat;
|
||||
bool use_fusion;
|
||||
bool use_concurrency;
|
||||
bool use_graph_optimize;
|
||||
|
||||
int debug_graph;
|
||||
int debug_fusion;
|
||||
|
||||
// how many times a given op was fused
|
||||
uint64_t fuse_cnt[GGML_OP_COUNT];
|
||||
|
||||
// capture state
|
||||
bool capture_next_compute;
|
||||
bool capture_started;
|
||||
|
||||
id<MTLCaptureScope> capture_scope;
|
||||
|
||||
// command buffer state
|
||||
int n_cb; // number of extra threads used to submit the command buffers
|
||||
int n_nodes_0; // number of nodes submitted by the main thread
|
||||
int n_nodes_1; // remaining number of nodes submitted by the n_cb threads
|
||||
int n_nodes_per_cb;
|
||||
|
||||
struct ggml_cgraph * gf;
|
||||
|
||||
// the callback given to the thread pool
|
||||
void (^encode_async)(size_t ith);
|
||||
|
||||
// n_cb command buffers + 1 used by the main thread
|
||||
struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
|
||||
|
||||
// extra command buffers for things like getting, setting and copying tensors
|
||||
NSMutableArray * cmd_bufs_ext;
|
||||
|
||||
// the last command buffer queued into the Metal queue with operations relevant to the current Metal backend
|
||||
id<MTLCommandBuffer> cmd_buf_last;
|
||||
|
||||
// abort ggml_metal_graph_compute if callback returns true
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
||||
GGML_LOG_INFO("%s: allocating\n", __func__);
|
||||
|
||||
#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
|
||||
// Show all the Metal device instances in the system
|
||||
NSArray * devices = MTLCopyAllDevices();
|
||||
for (id<MTLDevice> device in devices) {
|
||||
GGML_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
|
||||
}
|
||||
[devices release]; // since it was created by a *Copy* C method
|
||||
#endif
|
||||
|
||||
// init context
|
||||
ggml_metal_t res = calloc(1, sizeof(struct ggml_metal));
|
||||
|
||||
res->device = ggml_metal_device_get_obj(dev);
|
||||
|
||||
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[res->device name] UTF8String]);
|
||||
|
||||
// TODO: would it be better to have one queue for the backend and one queue for the device?
|
||||
// the graph encoders and async ops would use the backend queue while the sync ops would use the device queue?
|
||||
//res->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND]
|
||||
res->queue = ggml_metal_device_get_queue(dev);
|
||||
if (res->queue == nil) {
|
||||
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
res->dev = dev;
|
||||
res->lib = ggml_metal_device_get_library(dev);
|
||||
if (res->lib == NULL) {
|
||||
GGML_LOG_WARN("%s: the device does not have a precompiled Metal library - this is unexpected\n", __func__);
|
||||
GGML_LOG_WARN("%s: will try to compile it on the fly\n", __func__);
|
||||
|
||||
res->lib = ggml_metal_library_init(dev);
|
||||
if (res->lib == NULL) {
|
||||
GGML_LOG_ERROR("%s: error: failed to initialize the Metal library\n", __func__);
|
||||
|
||||
free(res);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
|
||||
|
||||
res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
res->use_bfloat = props_dev->has_bfloat;
|
||||
res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
|
||||
res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
|
||||
|
||||
{
|
||||
const char * val = getenv("GGML_METAL_GRAPH_DEBUG");
|
||||
res->debug_graph = val ? atoi(val) : 0;
|
||||
}
|
||||
|
||||
{
|
||||
const char * val = getenv("GGML_METAL_FUSION_DEBUG");
|
||||
res->debug_fusion = val ? atoi(val) : 0;
|
||||
}
|
||||
|
||||
res->use_graph_optimize = true;
|
||||
|
||||
if (getenv("GGML_METAL_GRAPH_OPTIMIZE_DISABLE") != NULL) {
|
||||
res->use_graph_optimize = false;
|
||||
}
|
||||
|
||||
memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt));
|
||||
|
||||
GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, res->use_bfloat ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: use fusion = %s\n", __func__, res->use_fusion ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
|
||||
|
||||
res->capture_next_compute = false;
|
||||
res->capture_started = false;
|
||||
res->capture_scope = nil;
|
||||
|
||||
res->gf = nil;
|
||||
res->encode_async = nil;
|
||||
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
|
||||
res->cmd_bufs[i].obj = nil;
|
||||
}
|
||||
|
||||
res->cmd_bufs_ext = [[NSMutableArray alloc] init];
|
||||
|
||||
res->cmd_buf_last = nil;
|
||||
|
||||
res->pipelines_ext = ggml_metal_pipelines_init();
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void ggml_metal_free(ggml_metal_t ctx) {
|
||||
GGML_LOG_INFO("%s: deallocating\n", __func__);
|
||||
|
||||
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
|
||||
if (ctx->cmd_bufs[i].obj) {
|
||||
[ctx->cmd_bufs[i].obj release];
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int) ctx->cmd_bufs_ext.count; ++i) {
|
||||
if (ctx->cmd_bufs_ext[i]) {
|
||||
[ctx->cmd_bufs_ext[i] release];
|
||||
}
|
||||
}
|
||||
|
||||
[ctx->cmd_bufs_ext removeAllObjects];
|
||||
[ctx->cmd_bufs_ext release];
|
||||
|
||||
if (ctx->pipelines_ext) {
|
||||
ggml_metal_pipelines_free(ctx->pipelines_ext);
|
||||
ctx->pipelines_ext = nil;
|
||||
}
|
||||
|
||||
if (ctx->debug_fusion > 0) {
|
||||
GGML_LOG_DEBUG("%s: fusion stats:\n", __func__);
|
||||
for (int i = 0; i < GGML_OP_COUNT; i++) {
|
||||
if (ctx->fuse_cnt[i] == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// note: cannot use ggml_log here
|
||||
GGML_LOG_DEBUG("%s: - %s: %" PRIu64 "\n", __func__, ggml_op_name((enum ggml_op) i), ctx->fuse_cnt[i]);
|
||||
}
|
||||
}
|
||||
|
||||
Block_release(ctx->encode_async);
|
||||
|
||||
//[ctx->queue release]; // [TAG_QUEUE_PER_BACKEND]
|
||||
|
||||
dispatch_release(ctx->d_queue);
|
||||
|
||||
free(ctx);
|
||||
}
|
||||
|
||||
void ggml_metal_synchronize(ggml_metal_t ctx) {
|
||||
// wait for any backend operations to finish
|
||||
if (ctx->cmd_buf_last) {
|
||||
[ctx->cmd_buf_last waitUntilCompleted];
|
||||
ctx->cmd_buf_last = nil;
|
||||
}
|
||||
|
||||
// release any completed command buffers
|
||||
if (ctx->cmd_bufs_ext.count > 0) {
|
||||
for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) {
|
||||
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs_ext[i];
|
||||
|
||||
MTLCommandBufferStatus status = [cmd_buf status];
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, (int) i, (int) status);
|
||||
if (status == MTLCommandBufferStatusError) {
|
||||
GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
[cmd_buf release];
|
||||
}
|
||||
|
||||
[ctx->cmd_bufs_ext removeAllObjects];
|
||||
}
|
||||
}
|
||||
|
||||
static struct ggml_metal_buffer_id ggml_metal_get_buffer_id(const struct ggml_tensor * t) {
|
||||
if (!t) {
|
||||
return (struct ggml_metal_buffer_id) { nil, 0 };
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
|
||||
|
||||
return ggml_metal_buffer_get_id(buffer->context, t);
|
||||
}
|
||||
|
||||
void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
@autoreleasepool {
|
||||
// wrap the source data into a Metal buffer
|
||||
id<MTLBuffer> buf_src = [ctx->device newBufferWithBytes:data
|
||||
length:size
|
||||
options:MTLResourceStorageModeShared];
|
||||
|
||||
struct ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(tensor);
|
||||
if (bid_dst.metal == nil) {
|
||||
GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
|
||||
}
|
||||
|
||||
bid_dst.offs += offset;
|
||||
|
||||
// queue the copy operation into the queue of the Metal context
|
||||
// this will be queued at the end, after any currently ongoing GPU operations
|
||||
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
|
||||
|
||||
[encoder copyFromBuffer:buf_src
|
||||
sourceOffset:0
|
||||
toBuffer:bid_dst.metal
|
||||
destinationOffset:bid_dst.offs
|
||||
size:size];
|
||||
|
||||
[encoder endEncoding];
|
||||
[cmd_buf commit];
|
||||
|
||||
// do not wait here for completion
|
||||
//[cmd_buf waitUntilCompleted];
|
||||
|
||||
// instead, remember a reference to the command buffer and wait for it later if needed
|
||||
[ctx->cmd_bufs_ext addObject:cmd_buf];
|
||||
ctx->cmd_buf_last = cmd_buf;
|
||||
|
||||
[cmd_buf retain];
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
@autoreleasepool {
|
||||
id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
|
||||
length:size
|
||||
options:MTLResourceStorageModeShared
|
||||
deallocator:nil];
|
||||
|
||||
struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(tensor);
|
||||
if (bid_src.metal == nil) {
|
||||
GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
|
||||
}
|
||||
|
||||
bid_src.offs += offset;
|
||||
|
||||
// queue the copy operation into the queue of the Metal context
|
||||
// this will be queued at the end, after any currently ongoing GPU operations
|
||||
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
|
||||
|
||||
[encoder copyFromBuffer:bid_src.metal
|
||||
sourceOffset:bid_src.offs
|
||||
toBuffer:buf_dst
|
||||
destinationOffset:0
|
||||
size:size];
|
||||
|
||||
[encoder endEncoding];
|
||||
[cmd_buf commit];
|
||||
|
||||
// do not wait here for completion
|
||||
//[cmd_buf waitUntilCompleted];
|
||||
|
||||
// instead, remember a reference to the command buffer and wait for it later if needed
|
||||
[ctx->cmd_bufs_ext addObject:cmd_buf];
|
||||
ctx->cmd_buf_last = cmd_buf;
|
||||
|
||||
[cmd_buf retain];
|
||||
}
|
||||
}
|
||||
|
||||
enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
|
||||
// number of nodes encoded by the main thread (empirically determined)
|
||||
const int n_main = 64;
|
||||
|
||||
// number of threads in addition to the main thread
|
||||
const int n_cb = ctx->n_cb;
|
||||
|
||||
// submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
|
||||
// the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
|
||||
// while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
|
||||
// each thread creates it's own command buffer and enqueues the ops in parallel
|
||||
//
|
||||
// tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2
|
||||
|
||||
@autoreleasepool {
|
||||
ctx->gf = gf;
|
||||
|
||||
ctx->n_nodes_0 = MIN(n_main, gf->n_nodes);
|
||||
ctx->n_nodes_1 = gf->n_nodes - ctx->n_nodes_0;
|
||||
|
||||
ctx->n_nodes_per_cb = (ctx->n_nodes_1 + ctx->n_cb - 1) / ctx->n_cb;
|
||||
|
||||
const bool use_capture = ctx->capture_next_compute;
|
||||
if (use_capture) {
|
||||
ctx->capture_next_compute = false;
|
||||
|
||||
// make sure all previous computations have finished before starting the capture
|
||||
if (ctx->cmd_buf_last) {
|
||||
[ctx->cmd_buf_last waitUntilCompleted];
|
||||
ctx->cmd_buf_last = nil;
|
||||
}
|
||||
|
||||
if (!ctx->capture_started) {
|
||||
// create capture scope
|
||||
ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
|
||||
|
||||
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
|
||||
descriptor.captureObject = ctx->capture_scope;
|
||||
descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
|
||||
descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]];
|
||||
|
||||
NSError * error = nil;
|
||||
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
|
||||
GGML_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
|
||||
} else {
|
||||
[ctx->capture_scope beginScope];
|
||||
ctx->capture_started = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// the main thread commits the first few commands immediately
|
||||
// cmd_buf[n_cb]
|
||||
{
|
||||
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
[cmd_buf retain];
|
||||
|
||||
if (ctx->cmd_bufs[n_cb].obj) {
|
||||
[ctx->cmd_bufs[n_cb].obj release];
|
||||
}
|
||||
ctx->cmd_bufs[n_cb].obj = cmd_buf;
|
||||
|
||||
[cmd_buf enqueue];
|
||||
|
||||
ctx->encode_async(n_cb);
|
||||
}
|
||||
|
||||
// remember the command buffer for the next iteration
|
||||
ctx->cmd_buf_last = ctx->cmd_bufs[n_cb].obj;
|
||||
|
||||
// prepare the rest of the command buffers asynchronously (optional)
|
||||
// cmd_buf[0.. n_cb)
|
||||
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
||||
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
[cmd_buf retain];
|
||||
|
||||
if (ctx->cmd_bufs[cb_idx].obj) {
|
||||
[ctx->cmd_bufs[cb_idx].obj release];
|
||||
}
|
||||
ctx->cmd_bufs[cb_idx].obj = cmd_buf;
|
||||
|
||||
// always enqueue the first two command buffers
|
||||
// enqueue all of the command buffers if we don't need to abort
|
||||
if (cb_idx < 2 || ctx->abort_callback == NULL) {
|
||||
[cmd_buf enqueue];
|
||||
|
||||
// update the pointer to the last queued command buffer
|
||||
// this is needed to implement synchronize()
|
||||
ctx->cmd_buf_last = cmd_buf;
|
||||
}
|
||||
}
|
||||
|
||||
dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async);
|
||||
|
||||
// for debugging: block until graph is computed
|
||||
//[ctx->cmd_buf_last waitUntilCompleted];
|
||||
|
||||
// enter here only when capturing in order to wait for all computation to finish
|
||||
// otherwise, we leave the graph to compute asynchronously
|
||||
if (!use_capture && ctx->capture_started) {
|
||||
// wait for completion and check status of each command buffer
|
||||
// needed to detect if the device ran out-of-memory for example (#1881)
|
||||
{
|
||||
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
|
||||
[cmd_buf waitUntilCompleted];
|
||||
|
||||
MTLCommandBufferStatus status = [cmd_buf status];
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
|
||||
if (status == MTLCommandBufferStatusError) {
|
||||
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
||||
}
|
||||
|
||||
return GGML_STATUS_FAILED;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_cb; ++i) {
|
||||
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
|
||||
[cmd_buf waitUntilCompleted];
|
||||
|
||||
MTLCommandBufferStatus status = [cmd_buf status];
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||
if (status == MTLCommandBufferStatusError) {
|
||||
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
||||
}
|
||||
|
||||
return GGML_STATUS_FAILED;
|
||||
}
|
||||
|
||||
id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil);
|
||||
if (!next_buffer) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
|
||||
if (next_queued) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
|
||||
GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i);
|
||||
return GGML_STATUS_ABORTED;
|
||||
}
|
||||
|
||||
[next_buffer commit];
|
||||
}
|
||||
|
||||
[ctx->capture_scope endScope];
|
||||
[[MTLCaptureManager sharedCaptureManager] stopCapture];
|
||||
}
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf) {
|
||||
//const int64_t t_start = ggml_time_us();
|
||||
|
||||
if (ctx->use_graph_optimize) {
|
||||
ggml_graph_optimize(gf);
|
||||
}
|
||||
|
||||
//printf("%s: graph optimize took %.3f ms\n", __func__, (ggml_time_us() - t_start) / 1000.0);
|
||||
}
|
||||
|
||||
void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
|
||||
if (ctx->n_cb != n_cb) {
|
||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
|
||||
|
||||
if (ctx->n_cb > 2) {
|
||||
GGML_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb);
|
||||
}
|
||||
}
|
||||
|
||||
if (ctx->encode_async) {
|
||||
Block_release(ctx->encode_async);
|
||||
}
|
||||
|
||||
ctx->encode_async = Block_copy(^(size_t iter) {
|
||||
const int cb_idx = iter;
|
||||
const int n_cb_l = ctx->n_cb;
|
||||
|
||||
const int n_nodes_0 = ctx->n_nodes_0;
|
||||
const int n_nodes_1 = ctx->n_nodes_1;
|
||||
|
||||
const int n_nodes_per_cb = ctx->n_nodes_per_cb;
|
||||
|
||||
int idx_start = 0;
|
||||
int idx_end = n_nodes_0;
|
||||
|
||||
if (cb_idx < n_cb_l) {
|
||||
idx_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb);
|
||||
idx_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
|
||||
}
|
||||
|
||||
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[cb_idx].obj;
|
||||
|
||||
ggml_metal_op_t ctx_op = ggml_metal_op_init(
|
||||
ctx->dev,
|
||||
cmd_buf,
|
||||
ctx->gf,
|
||||
idx_start,
|
||||
idx_end,
|
||||
ctx->use_fusion,
|
||||
ctx->use_concurrency,
|
||||
ctx->capture_next_compute,
|
||||
ctx->debug_graph,
|
||||
ctx->debug_fusion);
|
||||
|
||||
for (int idx = idx_start; idx < idx_end;) {
|
||||
const int res = ggml_metal_op_encode(ctx_op, idx);
|
||||
if (res == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
idx += res;
|
||||
}
|
||||
|
||||
ggml_metal_op_free(ctx_op);
|
||||
|
||||
if (cb_idx < 2 || ctx->abort_callback == NULL) {
|
||||
[cmd_buf commit];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void ggml_metal_set_abort_callback(ggml_metal_t ctx, ggml_abort_callback abort_callback, void * user_data) {
|
||||
ctx->abort_callback = abort_callback;
|
||||
ctx->abort_callback_data = user_data;
|
||||
}
|
||||
|
||||
bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
|
||||
GGML_ASSERT(ctx->device != nil);
|
||||
|
||||
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
||||
}
|
||||
|
||||
void ggml_metal_capture_next_compute(ggml_metal_t ctx) {
|
||||
ctx->capture_next_compute = true;
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,226 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct ggml_metal_buffer_id {
|
||||
void * metal; // id<MTLBuffer>
|
||||
size_t offs;
|
||||
};
|
||||
|
||||
typedef struct ggml_metal_device * ggml_metal_device_t;
|
||||
|
||||
//
|
||||
// MTLFunctionConstantValues wrapper
|
||||
//
|
||||
|
||||
typedef struct ggml_metal_cv * ggml_metal_cv_t;
|
||||
|
||||
ggml_metal_cv_t ggml_metal_cv_init(void);
|
||||
void ggml_metal_cv_free(ggml_metal_cv_t cv);
|
||||
|
||||
void ggml_metal_cv_set_int32(ggml_metal_cv_t cv, int32_t value, int32_t idx);
|
||||
void ggml_metal_cv_set_bool (ggml_metal_cv_t cv, bool value, int32_t idx);
|
||||
|
||||
//
|
||||
// MTLComputePipelineState wrapper
|
||||
//
|
||||
|
||||
typedef struct ggml_metal_pipeline * ggml_metal_pipeline_t;
|
||||
|
||||
ggml_metal_pipeline_t ggml_metal_pipeline_init(void);
|
||||
void ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline);
|
||||
|
||||
void ggml_metal_pipeline_set_nsg(ggml_metal_pipeline_t pipeline, int nsg);
|
||||
int ggml_metal_pipeline_get_nsg(ggml_metal_pipeline_t pipeline);
|
||||
|
||||
void ggml_metal_pipeline_set_nr0(ggml_metal_pipeline_t pipeline, int nr0);
|
||||
int ggml_metal_pipeline_get_nr0(ggml_metal_pipeline_t pipeline);
|
||||
|
||||
void ggml_metal_pipeline_set_nr1(ggml_metal_pipeline_t pipeline, int nr1);
|
||||
int ggml_metal_pipeline_get_nr1(ggml_metal_pipeline_t pipeline);
|
||||
|
||||
void ggml_metal_pipeline_set_smem(ggml_metal_pipeline_t pipeline, size_t smem);
|
||||
size_t ggml_metal_pipeline_get_smem(ggml_metal_pipeline_t pipeline);
|
||||
|
||||
int ggml_metal_pipeline_max_theads_per_threadgroup(ggml_metal_pipeline_t pipeline);
|
||||
|
||||
// a collection of pipelines
|
||||
typedef struct ggml_metal_pipelines * ggml_metal_pipelines_t;
|
||||
|
||||
ggml_metal_pipelines_t ggml_metal_pipelines_init(void);
|
||||
void ggml_metal_pipelines_free(ggml_metal_pipelines_t ppls);
|
||||
|
||||
void ggml_metal_pipelines_add(ggml_metal_pipelines_t ppls, const char * name, ggml_metal_pipeline_t pipeline);
|
||||
ggml_metal_pipeline_t ggml_metal_pipelines_get(ggml_metal_pipelines_t ppls, const char * name);
|
||||
|
||||
//
|
||||
// MTLCommandBuffer wrapper
|
||||
//
|
||||
|
||||
typedef void * ggml_metal_cmd_buf_t;
|
||||
|
||||
//
|
||||
// MTLComputeCommandEncoder wrapper
|
||||
//
|
||||
|
||||
typedef struct ggml_metal_encoder * ggml_metal_encoder_t;
|
||||
|
||||
ggml_metal_encoder_t ggml_metal_encoder_init(ggml_metal_cmd_buf_t cmd_buf_raw, bool concurrent);
|
||||
void ggml_metal_encoder_free(ggml_metal_encoder_t encoder);
|
||||
|
||||
void ggml_metal_encoder_debug_group_push(ggml_metal_encoder_t encoder, const char * name);
|
||||
void ggml_metal_encoder_debug_group_pop (ggml_metal_encoder_t encoder);
|
||||
|
||||
void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, ggml_metal_pipeline_t pipeline);
|
||||
|
||||
void ggml_metal_encoder_set_bytes (ggml_metal_encoder_t encoder, void * data, size_t size, int idx);
|
||||
void ggml_metal_encoder_set_buffer(ggml_metal_encoder_t encoder, struct ggml_metal_buffer_id buffer, int idx);
|
||||
|
||||
void ggml_metal_encoder_set_threadgroup_memory_size(ggml_metal_encoder_t encoder, size_t size, int idx);
|
||||
|
||||
void ggml_metal_encoder_dispatch_threadgroups(ggml_metal_encoder_t encoder, int tg0, int tg1, int tg2, int tptg0, int tptg1, int tptg2);
|
||||
|
||||
void ggml_metal_encoder_memory_barrier(ggml_metal_encoder_t encoder);
|
||||
|
||||
void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder);
|
||||
|
||||
//
|
||||
// MTLLibrary wrapper
|
||||
//
|
||||
|
||||
typedef struct ggml_metal_library * ggml_metal_library_t;
|
||||
|
||||
ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev);
|
||||
void ggml_metal_library_free(ggml_metal_library_t lib);
|
||||
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline (ggml_metal_library_t lib, const char * name);
|
||||
ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv);
|
||||
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_base (ggml_metal_library_t lib, enum ggml_op op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_cpy (ggml_metal_library_t lib, enum ggml_type tsrc, enum ggml_type tdst);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pool_2d (ggml_metal_library_t lib, const struct ggml_tensor * op, enum ggml_op_pool op_pool);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_get_rows (ggml_metal_library_t lib, enum ggml_type tsrc);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_set_rows (ggml_metal_library_t lib, enum ggml_type tdst);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_repeat (ggml_metal_library_t lib, enum ggml_type tsrc);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_unary (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_glu (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_sum_rows (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_soft_max (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_ssm_conv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int r1ptg);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id_map0 (ggml_metal_library_t lib, int ne02, int ne20);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv_id (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argmax (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argsort (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_bin (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rms_norm (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_l2_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_group_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_arange (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext(
|
||||
ggml_metal_library_t lib,
|
||||
const struct ggml_tensor * op,
|
||||
bool has_mask,
|
||||
bool has_sinks,
|
||||
bool has_bias,
|
||||
bool has_scap,
|
||||
int32_t nsg);
|
||||
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec(
|
||||
ggml_metal_library_t lib,
|
||||
const struct ggml_tensor * op,
|
||||
bool has_mask,
|
||||
bool has_sinks,
|
||||
bool has_bias,
|
||||
bool has_scap,
|
||||
int32_t nsg,
|
||||
int32_t nwg);
|
||||
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(
|
||||
ggml_metal_library_t lib,
|
||||
const struct ggml_tensor * op,
|
||||
int32_t dv,
|
||||
int32_t nwg);
|
||||
|
||||
//
|
||||
// device
|
||||
//
|
||||
|
||||
struct ggml_metal_device_props {
|
||||
char name[128];
|
||||
|
||||
size_t max_buffer_size;
|
||||
size_t max_working_set_size;
|
||||
size_t max_theadgroup_memory_size;
|
||||
|
||||
bool has_simdgroup_reduction;
|
||||
bool has_simdgroup_mm;
|
||||
bool has_unified_memory;
|
||||
bool has_bfloat;
|
||||
bool use_residency_sets;
|
||||
bool use_shared_buffers;
|
||||
|
||||
bool supports_gpu_family_apple7;
|
||||
};
|
||||
|
||||
ggml_metal_device_t ggml_metal_device_init(void);
|
||||
void ggml_metal_device_free(ggml_metal_device_t dev);
|
||||
|
||||
// return a singleton that is automatically destroyed when the program exits
|
||||
ggml_metal_device_t ggml_metal_device_get(void);
|
||||
|
||||
void * ggml_metal_device_get_obj (ggml_metal_device_t dev); // id<MTLDevice>
|
||||
void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id<MTLCommandQueue>
|
||||
|
||||
ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev);
|
||||
|
||||
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total);
|
||||
bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op);
|
||||
|
||||
const struct ggml_metal_device_props * ggml_metal_device_get_props(ggml_metal_device_t dev);
|
||||
|
||||
//
|
||||
// device buffers
|
||||
//
|
||||
|
||||
typedef struct ggml_metal_buffer * ggml_metal_buffer_t;
|
||||
|
||||
ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared);
|
||||
ggml_metal_buffer_t ggml_metal_buffer_map (ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size);
|
||||
|
||||
void ggml_metal_buffer_free (ggml_metal_buffer_t buf);
|
||||
void * ggml_metal_buffer_get_base (ggml_metal_buffer_t buf);
|
||||
bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf);
|
||||
|
||||
void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
||||
void ggml_metal_buffer_set_tensor (ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void ggml_metal_buffer_get_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
void ggml_metal_buffer_clear (ggml_metal_buffer_t buf, uint8_t value);
|
||||
|
||||
// finds the Metal buffer that contains the tensor data on the GPU device
|
||||
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
||||
// Metal buffer based on the host memory pointer
|
||||
//
|
||||
struct ggml_metal_buffer_id ggml_metal_buffer_get_id(ggml_metal_buffer_t buf, const struct ggml_tensor * t);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -165,6 +165,16 @@ typedef struct {
|
||||
uint64_t nb3;
|
||||
} ggml_metal_kargs_repeat;
|
||||
|
||||
typedef struct {
|
||||
float scale;
|
||||
float bias;
|
||||
} ggml_metal_kargs_scale;
|
||||
|
||||
typedef struct {
|
||||
float min;
|
||||
float max;
|
||||
} ggml_metal_kargs_clamp;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
@@ -453,7 +463,7 @@ typedef struct {
|
||||
uint64_t nb00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
int32_t n_groups;
|
||||
int32_t ngrp;
|
||||
float eps;
|
||||
} ggml_metal_kargs_group_norm;
|
||||
|
||||
@@ -506,14 +516,6 @@ typedef struct {
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
int64_t ne10;
|
||||
int64_t ne11;
|
||||
int64_t ne12;
|
||||
int64_t ne13;
|
||||
uint64_t nb10;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
uint64_t nb13;
|
||||
int64_t ne0;
|
||||
int64_t ne1;
|
||||
int64_t ne2;
|
||||
@@ -547,12 +549,6 @@ typedef struct {
|
||||
int32_t n_head_log2;
|
||||
} ggml_metal_kargs_soft_max;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int n_past;
|
||||
} ggml_metal_kargs_diag_mask_inf;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
@@ -579,7 +575,7 @@ typedef struct {
|
||||
int64_t n_group;
|
||||
int64_t n_seq_tokens;
|
||||
int64_t n_seqs;
|
||||
int64_t s_off;
|
||||
uint64_t s_off;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
@@ -719,7 +715,12 @@ typedef struct {
|
||||
int64_t IW;
|
||||
int64_t OH;
|
||||
int64_t OW;
|
||||
int64_t parallel_elements;
|
||||
int64_t np;
|
||||
} ggml_metal_kargs_pool_2d;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
uint64_t nb01;
|
||||
} ggml_metal_kargs_argmax;
|
||||
|
||||
#endif // GGML_METAL_IMPL
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,81 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-metal-device.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct ggml_metal_op * ggml_metal_op_t;
|
||||
|
||||
ggml_metal_op_t ggml_metal_op_init(
|
||||
ggml_metal_device_t dev,
|
||||
ggml_metal_cmd_buf_t cmd_buf,
|
||||
struct ggml_cgraph * gf,
|
||||
int idx_start,
|
||||
int idx_end,
|
||||
bool use_fusion,
|
||||
bool use_concurrency,
|
||||
bool use_capture,
|
||||
int debug_graph,
|
||||
int debug_fusion);
|
||||
|
||||
void ggml_metal_op_free(ggml_metal_op_t ctx);
|
||||
|
||||
int ggml_metal_op_encode(ggml_metal_op_t ctx, int idx);
|
||||
|
||||
//
|
||||
// available ops:
|
||||
//
|
||||
|
||||
// tokens per expert
|
||||
size_t ggml_metal_op_mul_mat_id_extra_tpe(const struct ggml_tensor * op);
|
||||
|
||||
// id map [n_tokens, n_expert]
|
||||
size_t ggml_metal_op_mul_mat_id_extra_ids(const struct ggml_tensor * op);
|
||||
|
||||
// return true if we should use the FA vector kernel for this op
|
||||
bool ggml_metal_op_flash_attn_ext_use_vec(const struct ggml_tensor * op);
|
||||
|
||||
size_t ggml_metal_op_flash_attn_ext_extra_tmp(const struct ggml_tensor * op);
|
||||
|
||||
int ggml_metal_op_concat (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_repeat (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_acc (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_scale (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_clamp (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_unary (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_glu (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_sum_rows (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_get_rows (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_set_rows (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_soft_max (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_rwkv (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_cpy (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_pool_2d (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_mul_mat (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_mul_mat_id (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_add_id (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_flash_attn_ext (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_bin (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_rms_norm (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_l2_norm (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_group_norm (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_norm (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_rope (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_im2col (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_conv_transpose_1d (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_upscale (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_pad (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_pad_reflect_1d (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_arange (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_argmax (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_argsort (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_leaky_relu (ggml_metal_op_t ctx, int idx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,718 @@
|
||||
#include "ggml-metal.h"
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-metal-device.h"
|
||||
#include "ggml-metal-context.h"
|
||||
#include "ggml-metal-ops.h"
|
||||
|
||||
// globals
|
||||
|
||||
// initialized in ggml_backend_metal_reg
|
||||
static ggml_backend_reg g_ggml_metal_reg;
|
||||
static ggml_backend_device g_ggml_metal_device;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// backend interface
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// shared buffer
|
||||
|
||||
static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
ggml_metal_buffer_free(ctx);
|
||||
}
|
||||
|
||||
static void * ggml_backend_metal_buffer_shared_get_base(ggml_backend_buffer_t buffer) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
return ggml_metal_buffer_get_base(ctx);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_shared_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
ggml_metal_buffer_memset_tensor(ctx, tensor, value, offset, size);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_shared_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
ggml_metal_buffer_set_tensor(ctx, tensor, data, offset, size);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_shared_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
ggml_metal_buffer_get_tensor(ctx, tensor, data, offset, size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_metal_buffer_shared_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
GGML_UNUSED(src);
|
||||
GGML_UNUSED(dst);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_shared_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
ggml_metal_buffer_clear(ctx, value);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_metal_buffer_shared_i = {
|
||||
/* .free_buffer = */ ggml_backend_metal_buffer_shared_free_buffer,
|
||||
/* .get_base = */ ggml_backend_metal_buffer_shared_get_base,
|
||||
/* .init_tensor = */ NULL,
|
||||
/* .memset_tensor = */ ggml_backend_metal_buffer_shared_memset_tensor,
|
||||
/* .set_tensor = */ ggml_backend_metal_buffer_shared_set_tensor,
|
||||
/* .get_tensor = */ ggml_backend_metal_buffer_shared_get_tensor,
|
||||
/* .cpy_tensor = */ ggml_backend_metal_buffer_shared_cpy_tensor,
|
||||
/* .clear = */ ggml_backend_metal_buffer_shared_clear,
|
||||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
// private buffer
|
||||
|
||||
static void ggml_backend_metal_buffer_private_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
ggml_metal_buffer_free(ctx);
|
||||
}
|
||||
|
||||
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
return ggml_metal_buffer_get_base(ctx);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_private_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
ggml_metal_buffer_memset_tensor(ctx, tensor, value, offset, size);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_private_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
ggml_metal_buffer_set_tensor(ctx, tensor, data, offset, size);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_private_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
ggml_metal_buffer_get_tensor(ctx, tensor, data, offset, size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_metal_buffer_private_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
GGML_UNUSED(src);
|
||||
GGML_UNUSED(dst);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_private_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
ggml_metal_buffer_t ctx = (ggml_metal_buffer_t)buffer->context;
|
||||
|
||||
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
ggml_metal_buffer_clear(ctx, value);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = {
|
||||
/* .free_buffer = */ ggml_backend_metal_buffer_private_free_buffer,
|
||||
/* .get_base = */ ggml_backend_metal_buffer_private_get_base,
|
||||
/* .init_tensor = */ NULL,
|
||||
/* .memset_tensor = */ ggml_backend_metal_buffer_private_memset_tensor,
|
||||
/* .set_tensor = */ ggml_backend_metal_buffer_private_set_tensor,
|
||||
/* .get_tensor = */ ggml_backend_metal_buffer_private_get_tensor,
|
||||
/* .cpy_tensor = */ ggml_backend_metal_buffer_private_cpy_tensor,
|
||||
/* .clear = */ ggml_backend_metal_buffer_private_clear,
|
||||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
//
|
||||
// buffer types
|
||||
//
|
||||
|
||||
// common method for allocating shread or private Metal buffers
|
||||
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size, bool shared) {
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)buft->device->context;
|
||||
ggml_metal_buffer_t res = ggml_metal_buffer_init(ctx_dev, size, shared);
|
||||
|
||||
ggml_backend_buffer_i buf_i = ggml_metal_buffer_is_shared(res)
|
||||
? ggml_backend_metal_buffer_shared_i
|
||||
: ggml_backend_metal_buffer_private_i;
|
||||
|
||||
return ggml_backend_buffer_init(buft, buf_i, res, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
size_t res = ggml_nbytes(tensor);
|
||||
|
||||
// some operations require additional memory for fleeting data:
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
res += ggml_metal_op_mul_mat_id_extra_tpe(tensor);
|
||||
res += ggml_metal_op_mul_mat_id_extra_ids(tensor);
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
if (ggml_metal_op_flash_attn_ext_use_vec(tensor)) {
|
||||
res += ggml_metal_op_flash_attn_ext_extra_tmp(tensor);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return res;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
// default (shared) buffer type
|
||||
|
||||
static const char * ggml_backend_metal_buffer_type_shared_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "Metal";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_shared_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
return ggml_backend_metal_buffer_type_alloc_buffer(buft, size, true);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_buffer_type_shared_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 32;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_buffer_type_shared_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)buft->device->context;
|
||||
|
||||
return ggml_metal_device_get_props(ctx_dev)->max_buffer_size;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_buffer_type_shared_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
return ggml_backend_metal_buffer_type_get_alloc_size(buft, tensor);
|
||||
}
|
||||
|
||||
static bool ggml_backend_metal_buffer_type_shared_is_host(ggml_backend_buffer_type_t buft) {
|
||||
return false;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(void) {
|
||||
static ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_metal_buffer_type_shared_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment,
|
||||
/* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size,
|
||||
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_shared_get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host,
|
||||
},
|
||||
/* .device = */ &g_ggml_metal_device,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_buffer_type_metal;
|
||||
}
|
||||
|
||||
// default (private) buffer type
|
||||
|
||||
static const char * ggml_backend_metal_buffer_type_private_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "Metal_Private";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_private_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
return ggml_backend_metal_buffer_type_alloc_buffer(buft, size, false);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_buffer_type_private_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 32;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_buffer_type_private_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)buft->device->context;
|
||||
|
||||
return ggml_metal_device_get_props(ctx_dev)->max_buffer_size;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_buffer_type_private_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
return ggml_backend_metal_buffer_type_get_alloc_size(buft, tensor);
|
||||
}
|
||||
|
||||
static bool ggml_backend_metal_buffer_type_private_is_host(ggml_backend_buffer_type_t buft) {
|
||||
return false;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(void) {
|
||||
static ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_metal_buffer_type_private_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment,
|
||||
/* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size,
|
||||
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_private_get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_metal_buffer_type_private_is_host,
|
||||
},
|
||||
/* .device = */ &g_ggml_metal_device,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_buffer_type_metal;
|
||||
}
|
||||
|
||||
// mapped buffer type
|
||||
|
||||
static const char * ggml_backend_metal_buffer_type_mapped_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "Metal_Mapped";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_mapped_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
// for mapped buffers, prefer shared memory
|
||||
return ggml_backend_metal_buffer_type_alloc_buffer(buft, size, true);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_buffer_type_mapped_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 32;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_buffer_type_mapped_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)buft->device->context;
|
||||
|
||||
return ggml_metal_device_get_props(ctx_dev)->max_buffer_size;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_buffer_type_mapped_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
return ggml_backend_metal_buffer_type_get_alloc_size(buft, tensor);
|
||||
}
|
||||
|
||||
static bool ggml_backend_metal_buffer_type_mapped_is_host(ggml_backend_buffer_type_t buft) {
|
||||
return false;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(void) {
|
||||
// note: not obvious, but this buffer type still needs to implement .alloc_buffer:
|
||||
// https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099
|
||||
static ggml_backend_buffer_type ggml_backend_buffer_type_mapped_metal = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_metal_buffer_type_mapped_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment,
|
||||
/* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size,
|
||||
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host,
|
||||
},
|
||||
/* .device = */ &g_ggml_metal_device,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_buffer_type_mapped_metal;
|
||||
}
|
||||
|
||||
// backend
|
||||
|
||||
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
||||
return "Metal";
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_free(ggml_backend_t backend) {
|
||||
ggml_metal_t ctx = (ggml_metal_t)backend->context;
|
||||
|
||||
// wait for any ongoing async operations to finish
|
||||
ggml_metal_synchronize(ctx);
|
||||
|
||||
ggml_metal_free(ctx);
|
||||
|
||||
free(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
|
||||
ggml_metal_t ctx = (ggml_metal_t)backend->context;
|
||||
|
||||
ggml_metal_synchronize(ctx);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_metal_t ctx = (ggml_metal_t)backend->context;
|
||||
|
||||
ggml_metal_set_tensor_async(ctx, tensor, data, offset, size);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_metal_t ctx = (ggml_metal_t)backend->context;
|
||||
|
||||
ggml_metal_get_tensor_async(ctx, tensor, data, offset, size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
return false;
|
||||
|
||||
GGML_UNUSED(backend_src);
|
||||
GGML_UNUSED(backend_dst);
|
||||
GGML_UNUSED(src);
|
||||
GGML_UNUSED(dst);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_metal_t ctx = (ggml_metal_t)backend->context;
|
||||
|
||||
return ggml_metal_graph_compute(ctx, cgraph);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_metal_t ctx = (ggml_metal_t)backend->context;
|
||||
|
||||
ggml_metal_graph_optimize(ctx, cgraph);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||
|
||||
ggml_metal_t ctx = (ggml_metal_t)backend->context;
|
||||
|
||||
ggml_metal_set_n_cb(ctx, n_cb);
|
||||
|
||||
}
|
||||
|
||||
static ggml_backend_i ggml_backend_metal_i = {
|
||||
/* .get_name = */ ggml_backend_metal_name,
|
||||
/* .free = */ ggml_backend_metal_free,
|
||||
/* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups
|
||||
/* .synchronize = */ ggml_backend_metal_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
||||
|
||||
// the events API is needed only for multi-GPU setups, so likely no need to implement it for Metal
|
||||
// in any case, these docs seem relevant if we ever decide to implement it:
|
||||
// https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ ggml_backend_metal_graph_optimize,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_metal_guid(void) {
|
||||
static ggml_guid guid = { 0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6 };
|
||||
return &guid;
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_metal_init(void) {
|
||||
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_metal_reg(), 0);
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
|
||||
|
||||
ggml_metal_t ctx = ggml_metal_init(ctx_dev);
|
||||
if (ctx == NULL) {
|
||||
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_backend_t backend = (ggml_backend_t) malloc(sizeof(ggml_backend));
|
||||
|
||||
*backend = {
|
||||
/* .guid = */ ggml_backend_metal_guid(),
|
||||
/* .interface = */ ggml_backend_metal_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
ggml_backend_metal_set_n_cb(backend, 1);
|
||||
|
||||
return backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_metal(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_metal_guid());
|
||||
}
|
||||
|
||||
void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) {
|
||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||
|
||||
ggml_metal_t ctx = (ggml_metal_t)backend->context;
|
||||
|
||||
ggml_metal_set_abort_callback(ctx, abort_callback, user_data);
|
||||
}
|
||||
|
||||
bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||
|
||||
ggml_metal_t ctx = (ggml_metal_t)backend->context;
|
||||
|
||||
return ggml_metal_supports_family(ctx, family);
|
||||
}
|
||||
|
||||
void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
|
||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||
|
||||
ggml_metal_t ctx = (ggml_metal_t)backend->context;
|
||||
|
||||
ggml_metal_capture_next_compute(ctx);
|
||||
}
|
||||
|
||||
// backend device
|
||||
|
||||
static const char * ggml_backend_metal_device_get_name(ggml_backend_dev_t dev) {
|
||||
return "Metal";
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) {
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
|
||||
|
||||
return ggml_metal_device_get_props(ctx_dev)->name;
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
|
||||
|
||||
ggml_metal_device_get_memory(ctx_dev, free, total);
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_metal_device_get_name(dev);
|
||||
props->description = ggml_backend_metal_device_get_description(dev);
|
||||
props->type = ggml_backend_metal_device_get_type(dev);
|
||||
|
||||
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
|
||||
props->caps = {
|
||||
/* .async = */ true,
|
||||
/* .host_buffer = */ false,
|
||||
/* .buffer_from_host_ptr = */ true,
|
||||
/* .events = */ false,
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_metal_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
|
||||
|
||||
ggml_metal_t ctx = ggml_metal_init(ctx_dev);
|
||||
if (ctx == NULL) {
|
||||
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_backend_t backend = (ggml_backend_t) malloc(sizeof(ggml_backend));
|
||||
|
||||
*backend = {
|
||||
/* .guid = */ ggml_backend_metal_guid(),
|
||||
/* .interface = */ ggml_backend_metal_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
ggml_backend_metal_set_n_cb(backend, 1);
|
||||
|
||||
return backend;
|
||||
|
||||
GGML_UNUSED(params);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_metal_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
|
||||
|
||||
const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);
|
||||
|
||||
return props_dev->use_shared_buffers ? ggml_backend_metal_buffer_type_shared() : ggml_backend_metal_buffer_type_private();
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
|
||||
|
||||
ggml_metal_buffer_t res = ggml_metal_buffer_map(ctx_dev, ptr, size, max_tensor_size);
|
||||
|
||||
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(), ggml_backend_metal_buffer_shared_i, res, size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
|
||||
|
||||
return ggml_metal_device_supports_op(ctx_dev, op);
|
||||
}
|
||||
|
||||
static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
return
|
||||
buft->iface.get_name == ggml_backend_metal_buffer_type_shared_get_name ||
|
||||
buft->iface.get_name == ggml_backend_metal_buffer_type_private_get_name ||
|
||||
buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static int64_t get_op_batch_size(const ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
return op->ne[1];
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
return op->ne[2];
|
||||
default:
|
||||
return ggml_nrows(op);
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
const int min_batch_size = 32;
|
||||
|
||||
return (op->op == GGML_OP_MUL_MAT ||
|
||||
op->op == GGML_OP_MUL_MAT_ID) &&
|
||||
get_op_batch_size(op) >= min_batch_size;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(op);
|
||||
}
|
||||
|
||||
static ggml_backend_device_i ggml_backend_metal_device_i = {
|
||||
/* .get_name = */ ggml_backend_metal_device_get_name,
|
||||
/* .get_description = */ ggml_backend_metal_device_get_description,
|
||||
/* .get_memory = */ ggml_backend_metal_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_metal_device_get_type,
|
||||
/* .get_props = */ ggml_backend_metal_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_metal_device_init,
|
||||
/* .get_buffer_type = */ ggml_backend_metal_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_mapped,
|
||||
/* .supports_op = */ ggml_backend_metal_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_metal_device_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_metal_device_offload_op,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
// backend registry
|
||||
|
||||
static const char * ggml_backend_metal_reg_get_name(ggml_backend_reg_t reg) {
|
||||
return "Metal";
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_reg_device_count(ggml_backend_reg_t reg) {
|
||||
return 1;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t reg, size_t index) {
|
||||
GGML_ASSERT(index == 0);
|
||||
|
||||
return &g_ggml_metal_device;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
GGML_UNUSED(index);
|
||||
}
|
||||
|
||||
static ggml_backend_feature g_ggml_backend_metal_features[] = {
|
||||
#if defined(GGML_METAL_EMBED_LIBRARY)
|
||||
{ "EMBED_LIBRARY", "1" },
|
||||
#endif
|
||||
{ NULL, NULL },
|
||||
};
|
||||
|
||||
static ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) {
|
||||
return g_ggml_backend_metal_features;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
if (strcmp(name, "ggml_backend_get_features") == 0) {
|
||||
return (void *)ggml_backend_metal_get_features;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static ggml_backend_reg_i ggml_backend_metal_reg_i = {
|
||||
/* .get_name = */ ggml_backend_metal_reg_get_name,
|
||||
/* .device_count = */ ggml_backend_metal_reg_device_count,
|
||||
/* .device_get = */ ggml_backend_metal_reg_device_get,
|
||||
/* .get_proc_address = */ ggml_backend_metal_get_proc_address,
|
||||
};
|
||||
|
||||
ggml_backend_reg_t ggml_backend_metal_reg(void) {
|
||||
{
|
||||
g_ggml_metal_reg = {
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_metal_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
g_ggml_metal_device = {
|
||||
/* .iface = */ ggml_backend_metal_device_i,
|
||||
/* .reg = */ &g_ggml_metal_reg,
|
||||
/* .context = */ ggml_metal_device_get(),
|
||||
};
|
||||
}
|
||||
|
||||
return &g_ggml_metal_reg;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -26,8 +26,8 @@ kernel void kernel_timestep_embedding(
|
||||
local_half_dim = logical_dim / 2;
|
||||
local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes);
|
||||
|
||||
if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) {
|
||||
local_embed_data_ptr[logical_dim] = 0.0f;
|
||||
if (logical_dim % 2 != 0 && local_j == local_half_dim) {
|
||||
local_embed_data_ptr[2 * local_half_dim] = 0.0f;
|
||||
}
|
||||
|
||||
if (local_j >= local_half_dim) {
|
||||
|
||||
@@ -225,9 +225,9 @@ struct bin_bcast_sycl {
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size),
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
|
||||
sycl::range<3>(1, 1, block_size),
|
||||
sycl::range<3>(1, 1, block_size)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast_unravel<bin_op>(
|
||||
@@ -246,8 +246,9 @@ struct bin_bcast_sycl {
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
|
||||
ne2, ne3, ne10, ne11, ne12, ne13,
|
||||
s1, s2, s3, s01, s02, s03, s11, s12, s13,
|
||||
@@ -302,6 +303,10 @@ inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, dst->src[0], dst->src[1], dst);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_count_equal(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_count_equal>>(ctx, dst->src[0], dst->src[1], dst);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, dst->src[0], dst->src[1], dst);
|
||||
@@ -327,6 +332,11 @@ void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_op_sub(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_count_equal(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
ggml_sycl_op_count_equal(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
ggml_sycl_op_mul(ctx, dst);
|
||||
|
||||
@@ -16,6 +16,12 @@ static __dpct_inline__ float op_sub(const float a, const float b) {
|
||||
return a - b;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_count_equal(const float a, const float b) {
|
||||
return (a == b) ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
void ggml_sycl_count_equal(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
static __dpct_inline__ float op_mul(const float a, const float b) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
@@ -89,24 +89,33 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
||||
sycl::range<3> gridDim(ne2, ne1, num_blocks);
|
||||
switch (dim) {
|
||||
case 0:
|
||||
sycl_parallel_for(stream,
|
||||
sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) { concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); });
|
||||
break;
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim *
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
|
||||
});
|
||||
break;
|
||||
case 1:
|
||||
sycl_parallel_for(stream,
|
||||
sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) { concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); });
|
||||
break;
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim *
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
|
||||
});
|
||||
break;
|
||||
// dim >=2 will be dispatched to the default path
|
||||
default:
|
||||
sycl_parallel_for(stream,
|
||||
sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) { concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); });
|
||||
break;
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim *
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
|
||||
});
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -120,7 +129,7 @@ static void concat_f32_sycl_non_cont(
|
||||
int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2,
|
||||
uint64_t nb3, int32_t dim) {
|
||||
sycl::range<3> gridDim(ne3, ne2, ne1);
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
||||
stream->parallel_for(sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
||||
int64_t i3 = item_ct1.get_group(0);
|
||||
int64_t i2 = item_ct1.get_group(1);
|
||||
int64_t i1 = item_ct1.get_group(2);
|
||||
|
||||
@@ -59,10 +59,16 @@ static void conv_transpose_1d_f32_f32_sycl(
|
||||
const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE;
|
||||
const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, 1, num_blocks);
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
conv_transpose_1d_kernel(s0, output_size, src0_ne0, src0_ne1, src0_ne2, src1_ne0, dst_ne0, src0, src1, dst,
|
||||
item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(
|
||||
block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
conv_transpose_1d_kernel(
|
||||
s0, output_size,
|
||||
src0_ne0, src0_ne1, src0_ne2,
|
||||
src1_ne0, dst_ne0,
|
||||
src0, src1, dst, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
+166
-99
@@ -33,11 +33,14 @@ static void dequantize_block_sycl(const void *__restrict__ vx,
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
sycl_parallel_for(
|
||||
stream,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1); });
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(
|
||||
sycl::range<3>(1, 1, num_blocks) *
|
||||
sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,18 +53,24 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); });
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 64),
|
||||
sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q2_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
#else
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); });
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q2_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -76,18 +85,24 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); });
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 64),
|
||||
sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q3_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
#else
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); });
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q3_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -101,9 +116,12 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_0(vx, y, nb32, item_ct1); });
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q4_0(vx, y, nb32, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -117,12 +135,13 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int
|
||||
int constexpr WARP_K = WARP_SIZE * QK4_0;
|
||||
const int n_warp = (k + WARP_K - 1) / WARP_K;
|
||||
GGML_ASSERT(k % 2 == 0);
|
||||
sycl_parallel_for(stream,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * sycl::range<3>(1, 1, WARP_SIZE),
|
||||
sycl::range<3>(1, 1, WARP_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
|
||||
sycl::range<3>(1, 1, WARP_SIZE),
|
||||
sycl::range<3>(1, 1, WARP_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
|
||||
dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
@@ -134,9 +153,12 @@ static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_1(vx, y, nb32, item_ct1); });
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q4_1(vx, y, nb32, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,13 +171,14 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
|
||||
});
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -168,13 +191,13 @@ static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const i
|
||||
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
|
||||
|
||||
sycl_parallel_for<1>(cgh, sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
|
||||
});
|
||||
cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -187,18 +210,24 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); });
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 64),
|
||||
sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q5_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
#else
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); });
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q5_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -213,18 +242,24 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); });
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 64),
|
||||
sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q6_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
#else
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); });
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q6_K(vx, y, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -236,9 +271,9 @@ static void dequantize_row_q6_K_sycl_reorder(const void * vx, dst_t * y, const i
|
||||
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
sycl_parallel_for(stream,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); });
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); });
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
@@ -249,10 +284,15 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_s(vx, y, item_ct1, iq1s_grid_gpu); });
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq1_s(
|
||||
vx, y, item_ct1, iq1s_grid_gpu
|
||||
);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -265,10 +305,15 @@ static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_m(vx, y, item_ct1, iq1s_grid_gpu); });
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq1_m(
|
||||
vx, y, item_ct1, iq1s_grid_gpu
|
||||
);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -281,12 +326,15 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq2_xxs(vx, y, item_ct1, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
|
||||
});
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq2_xxs(
|
||||
vx, y, item_ct1, iq2xxs_grid,
|
||||
ksigns_iq2xs, kmask_iq2xs);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -299,12 +347,15 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq2_xs(vx, y, item_ct1, iq2xs_grid, ksigns_iq2xs, kmask_iq2xs);
|
||||
});
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq2_xs(
|
||||
vx, y, item_ct1, iq2xs_grid,
|
||||
ksigns_iq2xs, kmask_iq2xs);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -317,10 +368,13 @@ static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_iq2_s(vx, y, item_ct1); });
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq2_s(vx, y, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -334,12 +388,15 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq3_xxs(vx, y, item_ct1, iq3xxs_grid, ksigns_iq2xs, kmask_iq2xs);
|
||||
});
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq3_xxs(
|
||||
vx, y, item_ct1, iq3xxs_grid,
|
||||
ksigns_iq2xs, kmask_iq2xs);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -352,10 +409,14 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_iq3_s(vx, y, item_ct1, kmask_iq2xs, iq3s_grid); });
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq3_s(
|
||||
vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -371,11 +432,14 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(
|
||||
cgh,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_xs(vx, y, item_ct1); });
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq4_xs(vx, y, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
#endif
|
||||
@@ -389,11 +453,14 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(
|
||||
cgh,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_nl(vx, y, item_ct1); });
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_iq4_nl(vx, y, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
+72
-94
@@ -201,8 +201,7 @@ static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, co
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
sycl_parallel_for(
|
||||
stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
@@ -220,8 +219,7 @@ static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, co
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
sycl_parallel_for(
|
||||
stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
@@ -239,8 +237,7 @@ static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, co
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
sycl_parallel_for(
|
||||
stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
@@ -256,11 +253,11 @@ static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, c
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
GGML_ASSERT(ne % QK8_0 == 0);
|
||||
const int num_blocks = ne / QK8_0;
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
@@ -268,11 +265,11 @@ static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, c
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ne;
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
@@ -281,11 +278,11 @@ static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, c
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
GGML_ASSERT(ne % QK4_0 == 0);
|
||||
const int num_blocks = ne / QK4_0;
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
@@ -293,9 +290,8 @@ static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, c
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ne;
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
@@ -308,11 +304,11 @@ static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, c
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
GGML_ASSERT(ne % QK4_1 == 0);
|
||||
const int num_blocks = ne / QK4_1;
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
@@ -320,9 +316,8 @@ static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, c
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ne;
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
@@ -335,11 +330,11 @@ static void ggml_cpy_f32_q5_0_sycl(const char * cx, char * cdst, const int ne, c
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
GGML_ASSERT(ne % QK5_0 == 0);
|
||||
const int num_blocks = ne / QK5_0;
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
@@ -347,9 +342,8 @@ static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, c
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ne;
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
@@ -362,11 +356,11 @@ static void ggml_cpy_f32_q5_1_sycl(const char * cx, char * cdst, const int ne, c
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
GGML_ASSERT(ne % QK5_1 == 0);
|
||||
const int num_blocks = ne / QK5_1;
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
@@ -374,9 +368,8 @@ static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, c
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ne;
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
@@ -389,11 +382,11 @@ static void ggml_cpy_f32_iq4_nl_sycl(const char * cx, char * cdst, const int ne,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
GGML_ASSERT(ne % QK4_NL == 0);
|
||||
const int num_blocks = ne / QK4_NL;
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
||||
ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
||||
@@ -404,8 +397,7 @@ static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, co
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
sycl_parallel_for(
|
||||
stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
@@ -424,8 +416,7 @@ static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, co
|
||||
// dpct::has_capability_or_fail(stream->get_device(),
|
||||
// {sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
@@ -444,8 +435,7 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
|
||||
// dpct::has_capability_or_fail(stream->get_device(),
|
||||
// {sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
@@ -460,13 +450,11 @@ static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
||||
ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -475,13 +463,11 @@ static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_q<block_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
||||
ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_q<block_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -491,13 +477,11 @@ static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
||||
|
||||
sycl_parallel_for(stream,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_q<block_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
||||
ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_q<block_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -506,13 +490,10 @@ static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const
|
||||
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_q<block_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
||||
ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_q<block_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -522,13 +503,10 @@ static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const
|
||||
const int nb12, const int nb13, queue_ptr stream) {
|
||||
|
||||
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_q<block_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
||||
ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_q_q<block_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
|
||||
|
||||
+67
-49
@@ -208,10 +208,12 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
|
||||
nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -875,11 +877,12 @@ static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloa
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(vx, y, dst, ncols,
|
||||
nrows, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -897,10 +900,12 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -916,10 +921,12 @@ static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -935,10 +942,12 @@ static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -954,10 +963,12 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -973,10 +984,12 @@ static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -989,10 +1002,11 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
|
||||
@@ -1004,10 +1018,11 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
|
||||
@@ -1019,10 +1034,11 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
|
||||
@@ -1031,10 +1047,11 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
|
||||
@@ -1046,10 +1063,11 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
|
||||
@@ -13,10 +13,10 @@
|
||||
#ifndef GGML_SYCL_DPCT_HELPER_HPP
|
||||
#define GGML_SYCL_DPCT_HELPER_HPP
|
||||
|
||||
#include <map>
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <sycl/half_type.hpp>
|
||||
#include <syclcompat/math.hpp>
|
||||
#include <map>
|
||||
|
||||
#ifdef GGML_SYCL_USE_INTEL_ONEMKL
|
||||
#include <oneapi/mkl.hpp>
|
||||
@@ -118,36 +118,6 @@ inline auto get_onemath_backend(sycl::queue& queue)
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS
|
||||
namespace syclex = sycl::ext::oneapi::experimental;
|
||||
#endif
|
||||
|
||||
template <int NR, typename Func>
|
||||
__dpct_inline__ void sycl_parallel_for(sycl::handler & cgh, sycl::nd_range<NR> nd_range, Func && func) {
|
||||
#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS
|
||||
syclex::nd_launch(cgh, nd_range, func);
|
||||
#else
|
||||
cgh.parallel_for(nd_range, func);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int NR, typename Func>
|
||||
__dpct_inline__ void sycl_parallel_for(sycl::queue * q, sycl::nd_range<NR> nd_range, Func && func) {
|
||||
#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS
|
||||
syclex::nd_launch(*q, nd_range, func);
|
||||
#else
|
||||
q->parallel_for(nd_range, func);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename Func> __dpct_inline__ void sycl_launch(sycl::queue * stream, Func && func) {
|
||||
#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS
|
||||
syclex::submit(*stream, func);
|
||||
#else
|
||||
stream->submit(func);
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace dpct
|
||||
{
|
||||
typedef sycl::queue *queue_ptr;
|
||||
|
||||
@@ -407,7 +407,7 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
|
||||
const int ne12, const int nb1, const int nb2,
|
||||
const int offset, queue_ptr stream) {
|
||||
int num_blocks = ceil_div(n_elements, SYCL_ACC_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) *
|
||||
sycl::range<1>(SYCL_ACC_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_ACC_BLOCK_SIZE)),
|
||||
@@ -425,8 +425,8 @@ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
|
||||
int dst_size = ne10 * ne11 * ne12 * ne13;
|
||||
int num_blocks = ceil_div(dst_size, SYCL_UPSCALE_BLOCK_SIZE);
|
||||
sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
|
||||
sycl_parallel_for<1>(
|
||||
stream, sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
|
||||
upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -437,7 +437,7 @@ static void pad_sycl(const T *x, T *dst, const int ne00,
|
||||
const int ne1, const int ne2, queue_ptr stream) {
|
||||
int num_blocks = ceil_div(ne0, SYCL_PAD_BLOCK_SIZE);
|
||||
sycl::range<3> gridDim(ne2, ne1, num_blocks);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); });
|
||||
@@ -639,7 +639,7 @@ static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, 256);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
|
||||
sycl::range<1>(256)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -652,7 +652,7 @@ static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, 256);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
|
||||
sycl::range<1>(256)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -665,7 +665,7 @@ static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, 256);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
|
||||
sycl::range<1>(256)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -678,7 +678,7 @@ static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tenso
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_SILU_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -691,7 +691,7 @@ static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tenso
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -704,7 +704,7 @@ static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -717,7 +717,7 @@ static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_t
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -730,7 +730,7 @@ static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tenso
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_TANH_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -743,7 +743,7 @@ static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tenso
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -756,7 +756,7 @@ static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggm
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -769,7 +769,7 @@ static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -782,7 +782,7 @@ static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -795,7 +795,7 @@ static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -808,7 +808,7 @@ static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -821,7 +821,7 @@ static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tenso
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -834,7 +834,7 @@ static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_te
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -847,7 +847,7 @@ static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tenso
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -860,7 +860,7 @@ static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -873,7 +873,7 @@ static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -888,7 +888,7 @@ static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float slope) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -901,7 +901,7 @@ static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_SQR_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -935,7 +935,7 @@ static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tens
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float min_arg, float max_arg) {
|
||||
const int num_blocks = ceil_div(k_elements, SYCL_CLAMP_BLOCK_SIZE);
|
||||
sycl_parallel_for(stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE),
|
||||
sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
@@ -967,7 +967,7 @@ static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tens
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
|
||||
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
|
||||
const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
|
||||
sycl_parallel_for(main_stream,
|
||||
main_stream->parallel_for(
|
||||
sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
|
||||
gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
|
||||
});
|
||||
@@ -978,7 +978,7 @@ static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tens
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
|
||||
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
|
||||
const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu
|
||||
sycl_parallel_for(main_stream,
|
||||
main_stream->parallel_for(
|
||||
sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
|
||||
gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
|
||||
});
|
||||
@@ -989,7 +989,7 @@ static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_ten
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
|
||||
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
|
||||
const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu
|
||||
sycl_parallel_for(main_stream,
|
||||
main_stream->parallel_for(
|
||||
sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
|
||||
gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
|
||||
});
|
||||
@@ -1000,7 +1000,7 @@ static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
|
||||
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
|
||||
const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
|
||||
sycl_parallel_for(main_stream,
|
||||
main_stream->parallel_for(
|
||||
sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
|
||||
gated_op_fused_geglu_erf(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
|
||||
});
|
||||
@@ -1011,7 +1011,7 @@ static inline void ggml_sycl_op_geglu_quick(ggml_backend_sycl_context & ctx, ggm
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
|
||||
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
|
||||
const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
|
||||
sycl_parallel_for(main_stream,
|
||||
main_stream->parallel_for(
|
||||
sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
|
||||
gated_op_fused_geglu_quick(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
|
||||
});
|
||||
|
||||
@@ -118,10 +118,12 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
|
||||
|
||||
GGML_ASSERT(ne00 % 2 == 0);
|
||||
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
k_get_rows<qk, qr, dq>(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12,
|
||||
item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_get_rows<qk, qr, dq>(
|
||||
src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
|
||||
s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
|
||||
});
|
||||
|
||||
GGML_UNUSED(dst);
|
||||
GGML_UNUSED(ctx);
|
||||
@@ -154,8 +156,9 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_parallel_for(
|
||||
stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
|
||||
s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
|
||||
});
|
||||
|
||||
@@ -1746,12 +1746,13 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
const size_t shared_mem = ncols_pad * sizeof(int);
|
||||
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
|
||||
sycl::range<1>(shared_mem), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
|
||||
x, dst, ncols, ncols_pad, item_ct1,
|
||||
dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
|
||||
@@ -1759,12 +1760,13 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
});
|
||||
});
|
||||
} else if (order == GGML_SORT_ORDER_DESC) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
|
||||
sycl::range<1>(shared_mem), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
|
||||
x, dst, ncols, ncols_pad, item_ct1,
|
||||
dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
|
||||
@@ -1782,47 +1784,50 @@ static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
const sycl::range<3> block_nums(1, nrows, 1);
|
||||
const size_t shared_mem = 256 * sizeof(float);
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<float, 1> shared_data(
|
||||
sycl::range<1>(shared_mem/sizeof(float)), cgh);
|
||||
sycl::local_accessor<int, 1> shared_indices(
|
||||
sycl::range<1>(shared_mem/sizeof(float)), cgh);
|
||||
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int row = item_ct1.get_global_id(1);
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int row = item_ct1.get_global_id(1);
|
||||
|
||||
float max_val = -INFINITY;
|
||||
int max_idx = -1;
|
||||
float max_val = -INFINITY;
|
||||
int max_idx = -1;
|
||||
|
||||
for (int col = tid; col < ncols; col += 256) {
|
||||
float val = x[row * ncols + col];
|
||||
if (val > max_val) {
|
||||
max_val = val;
|
||||
max_idx = col;
|
||||
}
|
||||
}
|
||||
|
||||
shared_data[tid] = max_val;
|
||||
shared_indices[tid] = max_idx;
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
for (int stride = 256 / 2; stride > 0; stride >>= 1) {
|
||||
if (tid < stride) {
|
||||
float val1 = shared_data[tid];
|
||||
float val2 = shared_data[tid + stride];
|
||||
if (val2 > val1) {
|
||||
shared_data[tid] = val2;
|
||||
shared_indices[tid] = shared_indices[tid + stride];
|
||||
for (int col = tid; col < ncols; col += 256) {
|
||||
float val = x[row * ncols + col];
|
||||
if (val > max_val) {
|
||||
max_val = val;
|
||||
max_idx = col;
|
||||
}
|
||||
}
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
dst[row] = shared_indices[0];
|
||||
}
|
||||
});
|
||||
shared_data[tid] = max_val;
|
||||
shared_indices[tid] = max_idx;
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
for (int stride = 256/2; stride > 0; stride >>= 1) {
|
||||
if (tid < stride) {
|
||||
float val1 = shared_data[tid];
|
||||
float val2 = shared_data[tid + stride];
|
||||
if (val2 > val1) {
|
||||
shared_data[tid] = val2;
|
||||
shared_indices[tid] = shared_indices[tid + stride];
|
||||
}
|
||||
}
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
}
|
||||
|
||||
|
||||
if (tid == 0) {
|
||||
dst[row] = shared_indices[0];
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
static void diag_mask_inf_f32_sycl(const float *x, float *dst,
|
||||
@@ -2895,7 +2900,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||
void ** ptrs_dst_get = ptrs_dst.get();
|
||||
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
|
||||
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
|
||||
nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
|
||||
});
|
||||
@@ -3403,7 +3408,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
{
|
||||
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
|
||||
sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 0> src1_row_acc(cgh);
|
||||
|
||||
char *__restrict src1_contiguous_get =
|
||||
@@ -3415,8 +3420,9 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
size_t ids_nb_ct6 = ids->nb[1];
|
||||
size_t ids_nb_ct7 = ids->nb[0];
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_copy_src1_to_contiguous(
|
||||
src1_original, src1_contiguous_get,
|
||||
dev_cur_src1_row_get,
|
||||
@@ -3447,14 +3453,15 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
{
|
||||
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
|
||||
sycl::range<3> grid_dims(1, 1, num_src1_rows);
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
const char *__restrict dst_contiguous_get =
|
||||
dst_contiguous.get();
|
||||
const mmid_row_mapping *__restrict dev_row_mapping_get =
|
||||
dev_row_mapping.get();
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_copy_dst_from_contiguous(dst_original,
|
||||
dst_contiguous_get,
|
||||
dev_row_mapping_get,
|
||||
@@ -3570,6 +3577,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
||||
case GGML_OP_SUB:
|
||||
ggml_sycl_sub(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
ggml_sycl_count_equal(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_ACC:
|
||||
ggml_sycl_acc(ctx, dst);
|
||||
break;
|
||||
@@ -4349,6 +4359,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_REPEAT:
|
||||
|
||||
@@ -11,13 +11,13 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B,
|
||||
const u_int n_seq_tokens = T / B;
|
||||
sycl::range<1> block_dims((C / H));
|
||||
sycl::range<1> grid_dims((B * H));
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
/* local memory accessors*/
|
||||
auto _k = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
|
||||
auto _r = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
|
||||
auto _td = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
|
||||
|
||||
sycl_parallel_for<1>(cgh, sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) {
|
||||
cgh.parallel_for(sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) {
|
||||
u_int tid = item.get_local_id(0);
|
||||
u_int bid = item.get_group(0);
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@ static void im2col_sycl_internal(const float * x, T * dst, int64_t IW, int64_t I
|
||||
|
||||
const int64_t CHW = IC * KH * KW;
|
||||
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * local_range, local_range), [=](sycl::nd_item<3> item_ct1) {
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * local_range, local_range), [=](sycl::nd_item<3> item_ct1) {
|
||||
im2col_kernel<T>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, CHW, s0, s1,
|
||||
p0, p1, d0, d1, item_ct1);
|
||||
});
|
||||
|
||||
+80
-60
@@ -1818,7 +1818,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
|
||||
@@ -1829,8 +1829,9 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q4_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -1852,7 +1853,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
|
||||
@@ -1863,8 +1864,9 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q4_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -1931,7 +1933,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
|
||||
@@ -1942,8 +1944,9 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q4_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -1965,7 +1968,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
|
||||
@@ -1976,8 +1979,9 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q4_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2044,7 +2048,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
|
||||
@@ -2055,8 +2059,9 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q5_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2078,7 +2083,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
|
||||
@@ -2089,8 +2094,9 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q5_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2157,7 +2163,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
|
||||
@@ -2168,8 +2174,9 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q5_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2191,7 +2198,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
|
||||
@@ -2202,8 +2209,9 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q5_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2270,7 +2278,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
|
||||
@@ -2281,8 +2289,9 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q8_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2304,7 +2313,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
|
||||
@@ -2315,8 +2324,9 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q8_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2383,7 +2393,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
|
||||
@@ -2396,8 +2406,9 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q2_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2420,7 +2431,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
|
||||
@@ -2433,8 +2444,9 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q2_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2504,7 +2516,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
|
||||
@@ -2519,8 +2531,9 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q3_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2544,7 +2557,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
|
||||
@@ -2559,8 +2572,9 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q3_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2630,7 +2644,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
|
||||
@@ -2643,8 +2657,9 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q4_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2667,7 +2682,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
|
||||
@@ -2680,8 +2695,9 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q4_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2749,7 +2765,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
|
||||
@@ -2762,8 +2778,9 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q5_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2786,7 +2803,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
|
||||
@@ -2799,8 +2816,9 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q5_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2868,7 +2886,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
|
||||
@@ -2881,8 +2899,9 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q6_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2905,7 +2924,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
|
||||
@@ -2918,8 +2937,9 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q6_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
|
||||
+201
-132
@@ -544,12 +544,12 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy,
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>>(vx, vy, dst, ncols, nrows,
|
||||
nd_item);
|
||||
});
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>>(vx, vy, dst, ncols, nrows,
|
||||
nd_item);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -561,12 +561,12 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float *
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -580,12 +580,17 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
|
||||
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -599,12 +604,17 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
|
||||
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -618,12 +628,17 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
|
||||
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -637,12 +652,17 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
|
||||
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -656,12 +676,17 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
|
||||
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -675,12 +700,17 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
|
||||
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -694,12 +724,17 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
|
||||
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -715,12 +750,12 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy,
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols, nrows,
|
||||
nd_item);
|
||||
});
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols,
|
||||
nrows, nd_item);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -734,12 +769,17 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
|
||||
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -754,12 +794,12 @@ static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy,
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(vx, vy, dst, ncols, nrows,
|
||||
nd_item);
|
||||
});
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(vx, vy, dst, ncols, nrows,
|
||||
nd_item);
|
||||
});
|
||||
});
|
||||
}
|
||||
static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
@@ -771,12 +811,17 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
|
||||
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -791,12 +836,14 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS / 2, block_iq2_xxs, 1>(vx, vy, dst, ncols,
|
||||
nrows, item_ct1);
|
||||
});
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -810,12 +857,14 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS / 2, block_iq2_xs, 1>(vx, vy, dst, ncols,
|
||||
nrows, item_ct1);
|
||||
});
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -829,12 +878,15 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S / 2, block_iq2_s, 1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -848,12 +900,15 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS / 2, block_iq3_xxs, 1>(vx, vy, dst, ncols,
|
||||
nrows, item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -867,12 +922,15 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S / 2, block_iq3_s, 1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -886,12 +944,15 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -905,12 +966,14 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -924,12 +987,15 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -943,12 +1009,15 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS / 4, block_iq4_xs, 1>(vx, vy, dst, ncols,
|
||||
nrows, item_ct1);
|
||||
});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
+74
-55
@@ -254,13 +254,14 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
|
||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||
if (ncols < 1024) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
|
||||
nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
}
|
||||
else {
|
||||
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
|
||||
@@ -271,15 +272,16 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
|
||||
sycl::range<1>(work_group_size / WARP_SIZE), cgh);
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -288,14 +290,18 @@ static void group_norm_f32_sycl(const float* x, float* dst,
|
||||
const int ne_elements, queue_ptr stream, int device) {
|
||||
if (group_size < 1024) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
const float eps_ct4 = eps;
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1, nullptr,
|
||||
WARP_SIZE);
|
||||
});
|
||||
});
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
group_norm_f32(
|
||||
x, dst, group_size, ne_elements, eps_ct4, item_ct1,
|
||||
nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
}
|
||||
else {
|
||||
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
|
||||
@@ -307,18 +313,22 @@ static void group_norm_f32_sycl(const float* x, float* dst,
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
||||
cgh);
|
||||
|
||||
const float eps_ct4 = eps;
|
||||
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1,
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
group_norm_f32(x, dst, group_size, ne_elements,
|
||||
eps_ct4, item_ct1,
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -330,13 +340,14 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
|
||||
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
|
||||
if (ncols < 1024) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
|
||||
nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
}
|
||||
else {
|
||||
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
|
||||
@@ -347,15 +358,16 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
||||
cgh);
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -366,12 +378,16 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
|
||||
if (ncols < 1024) {
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
l2_norm_f32(x, dst, ncols, eps, item_ct1, nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
l2_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
}
|
||||
else {
|
||||
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
|
||||
@@ -382,15 +398,18 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
||||
cgh);
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
l2_norm_f32(x, dst, ncols, eps, item_ct1, get_pointer(s_sum_acc_ct1),
|
||||
work_group_size);
|
||||
});
|
||||
});
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
l2_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+20
-24
@@ -232,22 +232,20 @@ static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, c
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_norm<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
rope_norm<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
|
||||
theta_scale, freq_factors, item_ct1);
|
||||
});
|
||||
} else {
|
||||
/*
|
||||
DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_norm<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
rope_norm<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
|
||||
theta_scale, freq_factors, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -266,17 +264,15 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
if (freq_factors == nullptr) {
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
|
||||
theta_scale, freq_factors, item_ct1);
|
||||
});
|
||||
} else {
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
|
||||
attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
|
||||
});
|
||||
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
rope_neox<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
|
||||
theta_scale, freq_factors, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -299,12 +295,12 @@ static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1,
|
||||
}
|
||||
// launch kernel
|
||||
if (freq_factors == nullptr) {
|
||||
sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
|
||||
stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
|
||||
rope_multi<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
|
||||
corr_dims, theta_scale, freq_factors, sections, item_ct1);
|
||||
});
|
||||
} else {
|
||||
sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
|
||||
stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
|
||||
rope_multi<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
|
||||
corr_dims, theta_scale, freq_factors, sections, item_ct1);
|
||||
});
|
||||
@@ -334,12 +330,12 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1,
|
||||
}
|
||||
// launch kernel
|
||||
if (freq_factors == nullptr) {
|
||||
sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
|
||||
stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
|
||||
rope_vision<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
|
||||
corr_dims, theta_scale, freq_factors, sections, item_ct1);
|
||||
});
|
||||
} else {
|
||||
sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
|
||||
stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
|
||||
rope_vision<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
|
||||
corr_dims, theta_scale, freq_factors, sections, item_ct1);
|
||||
});
|
||||
|
||||
@@ -48,7 +48,7 @@ static void set_rows_sycl_q(const char * __restrict__ src0_d,
|
||||
constexpr int block_size = 256;
|
||||
const int64_t grid_size = ceil_div(total_blocks, block_size);
|
||||
|
||||
sycl_parallel_for(stream, sycl::nd_range<1>(grid_size * block_size, block_size), [=](sycl::nd_item<1> item_ct1) {
|
||||
stream->parallel_for(sycl::nd_range<1>(grid_size * block_size, block_size), [=](sycl::nd_item<1> item_ct1) {
|
||||
const int64_t i = item_ct1.get_global_linear_id();
|
||||
if (i >= total_blocks) {
|
||||
return;
|
||||
@@ -129,8 +129,7 @@ static void set_rows_sycl(
|
||||
constexpr int block_size = 64;
|
||||
const int64_t grid_size = ceil_div(total_elements, block_size);
|
||||
|
||||
sycl_parallel_for(
|
||||
stream,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(grid_size * block_size, block_size),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
k_set_rows<TIn, TOut>(
|
||||
|
||||
@@ -127,11 +127,11 @@ static void soft_max_f32_submitter(const float * x, const T * mask, float * dst,
|
||||
const int nrows_y, const float scale, const float max_bias, const float m0,
|
||||
const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
|
||||
const size_t n_local_scratch, queue_ptr stream) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
|
||||
nrows_y, scale, max_bias, m0,
|
||||
|
||||
@@ -21,11 +21,12 @@ static void timestep_embedding_f32(
|
||||
int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
float * embed_data = (float *)((char *)dst + i*nb1);
|
||||
|
||||
if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
|
||||
embed_data[dim] = 0.f;
|
||||
int half = dim / 2;
|
||||
|
||||
if (dim % 2 != 0 && j == half) {
|
||||
embed_data[2 * half] = 0.f;
|
||||
}
|
||||
|
||||
int half = dim / 2;
|
||||
if (j >= half) {
|
||||
return;
|
||||
}
|
||||
@@ -45,9 +46,14 @@ static void timestep_embedding_f32_sycl(
|
||||
int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
|
||||
sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
|
||||
sycl::range<3> gridDim(1, ne00, num_blocks);
|
||||
sycl_parallel_for(stream, sycl::nd_range<3>(gridDim * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
timestep_embedding_f32(x, dst, nb1, dim, max_period, item_ct1);
|
||||
});
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(
|
||||
gridDim * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
timestep_embedding_f32(
|
||||
x, dst, nb1, dim, max_period, item_ct1
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
+16
-12
@@ -207,11 +207,12 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
// Submit kernel
|
||||
if (C / H == WKV_BLOCK_SIZE) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rwkv_wkv6_f32_kernel<WKV_BLOCK_SIZE>(
|
||||
B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
|
||||
item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
|
||||
@@ -219,11 +220,12 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
});
|
||||
});
|
||||
} else {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rwkv_wkv6_f32_kernel<WKV_BLOCK_SIZE * 2>(
|
||||
B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
|
||||
item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
|
||||
@@ -262,11 +264,12 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
// Submit kernel
|
||||
if (C / H == WKV_BLOCK_SIZE) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rwkv_wkv7_f32_kernel<WKV_BLOCK_SIZE>(
|
||||
B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d,
|
||||
item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
|
||||
@@ -274,11 +277,12 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
});
|
||||
});
|
||||
} else {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
|
||||
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rwkv_wkv7_f32_kernel<WKV_BLOCK_SIZE * 2>(
|
||||
B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d,
|
||||
item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
|
||||
|
||||
@@ -5,8 +5,14 @@
|
||||
#include "ggml-cpu.h"
|
||||
#endif
|
||||
|
||||
// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
|
||||
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
|
||||
|
||||
#include <vulkan/vulkan.hpp>
|
||||
|
||||
// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
|
||||
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iomanip>
|
||||
@@ -121,6 +127,8 @@ struct vk_pipeline_struct {
|
||||
bool needed {};
|
||||
// set to true when the shader has been compiled
|
||||
bool compiled {};
|
||||
// number of registers used, extracted from pipeline executable properties
|
||||
uint32_t register_count {};
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
|
||||
@@ -429,6 +437,8 @@ struct vk_device_struct {
|
||||
|
||||
bool coopmat2;
|
||||
|
||||
bool pipeline_executable_properties_support {};
|
||||
|
||||
size_t idx;
|
||||
|
||||
bool mul_mat_l[GGML_TYPE_COUNT];
|
||||
@@ -1221,8 +1231,6 @@ static std::string format_size(size_t size) {
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
static std::mutex log_mutex;
|
||||
|
||||
class vk_memory_logger {
|
||||
public:
|
||||
vk_memory_logger(): total_device(0), total_host(0) {}
|
||||
@@ -1412,6 +1420,8 @@ struct ggml_backend_vk_buffer_context {
|
||||
};
|
||||
|
||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
||||
static std::mutex log_mutex;
|
||||
|
||||
void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
|
||||
std::lock_guard<std::mutex> guard(log_mutex);
|
||||
vk_buffer buf = buf_ref.lock();
|
||||
@@ -1603,6 +1613,20 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast<VkDebugUtilsObjectNameInfoEXT &>(duoni));
|
||||
}
|
||||
|
||||
if (device->pipeline_executable_properties_support) {
|
||||
vk::PipelineExecutableInfoKHR executableInfo;
|
||||
executableInfo.pipeline = pipeline->pipeline;
|
||||
|
||||
auto statistics = device->device.getPipelineExecutableStatisticsKHR(executableInfo);
|
||||
for (auto & s : statistics) {
|
||||
// "Register Count" is reported by NVIDIA drivers.
|
||||
if (strcmp(s.name, "Register Count") == 0) {
|
||||
VK_LOG_DEBUG(pipeline->name << " " << s.name << ": " << s.value.u64 << " registers");
|
||||
pipeline->register_count = (uint32_t)s.value.u64;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> guard(device->mutex);
|
||||
device->all_pipelines.push_back(pipeline);
|
||||
@@ -1960,7 +1984,7 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
|
||||
}
|
||||
}
|
||||
|
||||
if (buf->device_memory == VK_NULL_HANDLE) {
|
||||
if (!buf->device_memory) {
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
||||
}
|
||||
@@ -3610,6 +3634,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
bool amd_shader_core_properties2 = false;
|
||||
bool pipeline_robustness = false;
|
||||
bool coopmat2_support = false;
|
||||
bool pipeline_executable_properties_support = false;
|
||||
device->coopmat_support = false;
|
||||
device->integer_dot_product = false;
|
||||
bool bfloat16_support = false;
|
||||
@@ -3652,6 +3677,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
!getenv("GGML_VK_DISABLE_BFLOAT16")) {
|
||||
bfloat16_support = true;
|
||||
#endif
|
||||
} else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) {
|
||||
pipeline_executable_properties_support = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3878,8 +3905,18 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device_extensions.push_back("VK_KHR_shader_integer_dot_product");
|
||||
}
|
||||
|
||||
VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR pep_features {};
|
||||
pep_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR;
|
||||
if (pipeline_executable_properties_support) {
|
||||
last_struct->pNext = (VkBaseOutStructure *)&pep_features;
|
||||
last_struct = (VkBaseOutStructure *)&pep_features;
|
||||
device_extensions.push_back("VK_KHR_pipeline_executable_properties");
|
||||
}
|
||||
|
||||
vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);
|
||||
|
||||
device->pipeline_executable_properties_support = pipeline_executable_properties_support;
|
||||
|
||||
device->fp16 = device->fp16 && vk12_features.shaderFloat16;
|
||||
|
||||
#if defined(VK_KHR_shader_bfloat16)
|
||||
@@ -4386,8 +4423,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
|
||||
static bool ggml_vk_instance_validation_ext_available();
|
||||
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
|
||||
|
||||
static bool ggml_vk_instance_debug_utils_ext_available(const std::vector<vk::ExtensionProperties> & instance_extensions);
|
||||
static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev);
|
||||
|
||||
static void ggml_vk_instance_init() {
|
||||
if (vk_instance_initialized) {
|
||||
@@ -4395,6 +4432,9 @@ static void ggml_vk_instance_init() {
|
||||
}
|
||||
VK_LOG_DEBUG("ggml_vk_instance_init()");
|
||||
|
||||
// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
|
||||
VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
|
||||
|
||||
uint32_t api_version = vk::enumerateInstanceVersion();
|
||||
|
||||
if (api_version < VK_API_VERSION_1_2) {
|
||||
@@ -4462,6 +4502,9 @@ static void ggml_vk_instance_init() {
|
||||
|
||||
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
|
||||
|
||||
// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
|
||||
VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);
|
||||
|
||||
std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
|
||||
|
||||
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
|
||||
@@ -4497,7 +4540,7 @@ static void ggml_vk_instance_init() {
|
||||
new_driver.pNext = &new_id;
|
||||
devices[i].getProperties2(&new_props);
|
||||
|
||||
if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
|
||||
if ((new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) && ggml_vk_device_is_supported(devices[i])) {
|
||||
// Check if there are two physical devices corresponding to the same GPU
|
||||
auto old_device = std::find_if(
|
||||
vk_instance.device_indices.begin(),
|
||||
@@ -4567,7 +4610,7 @@ static void ggml_vk_instance_init() {
|
||||
}
|
||||
}
|
||||
|
||||
// If no dedicated GPUs found, fall back to the first non-CPU device.
|
||||
// If no GPUs found, fall back to the first non-CPU device.
|
||||
// If only CPU devices are available, return without devices.
|
||||
if (vk_instance.device_indices.empty()) {
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
@@ -12078,12 +12121,63 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
|
||||
}
|
||||
}
|
||||
|
||||
static vk::PhysicalDeviceType ggml_backend_vk_get_device_type(int device_idx) {
|
||||
GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size());
|
||||
|
||||
vk::PhysicalDevice device = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device_idx]];
|
||||
|
||||
vk::PhysicalDeviceProperties2 props = {};
|
||||
device.getProperties2(&props);
|
||||
|
||||
return props.properties.deviceType;
|
||||
}
|
||||
|
||||
static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size());
|
||||
|
||||
vk::PhysicalDevice device = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device_idx]];
|
||||
|
||||
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
||||
|
||||
bool ext_support = false;
|
||||
|
||||
for (const auto& properties : ext_props) {
|
||||
if (strcmp("VK_EXT_pci_bus_info", properties.extensionName) == 0) {
|
||||
ext_support = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ext_support) {
|
||||
return "";
|
||||
}
|
||||
|
||||
vk::PhysicalDeviceProperties2 props = {};
|
||||
vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_info = {};
|
||||
|
||||
props.pNext = &pci_bus_info;
|
||||
|
||||
device.getProperties2(&props);
|
||||
|
||||
const uint32_t pci_domain = pci_bus_info.pciDomain;
|
||||
const uint32_t pci_bus = pci_bus_info.pciBus;
|
||||
const uint32_t pci_device = pci_bus_info.pciDevice;
|
||||
const uint8_t pci_function = (uint8_t) pci_bus_info.pciFunction; // pci function is between 0 and 7, prevent printf overflow warning
|
||||
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
|
||||
|
||||
return std::string(pci_bus_id);
|
||||
}
|
||||
|
||||
//////////////////////////
|
||||
|
||||
struct ggml_backend_vk_device_context {
|
||||
size_t device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
bool is_integrated_gpu;
|
||||
std::string pci_bus_id;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -12112,16 +12206,18 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
|
||||
UNUSED(dev);
|
||||
// TODO: return GGML_BACKEND_DEVICE_TYPE_IGPU for integrated GPUs
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
|
||||
return ctx->is_integrated_gpu ? GGML_BACKEND_DEVICE_TYPE_IGPU : GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
|
||||
props->name = ggml_backend_vk_device_get_name(dev);
|
||||
props->description = ggml_backend_vk_device_get_description(dev);
|
||||
props->type = ggml_backend_vk_device_get_type(dev);
|
||||
// TODO: set props->device_id to PCI bus id
|
||||
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
@@ -12388,8 +12484,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
}
|
||||
|
||||
if (
|
||||
src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32 ||
|
||||
src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32
|
||||
(src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) ||
|
||||
(src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
@@ -12554,6 +12650,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
ctx->device = i;
|
||||
ctx->name = GGML_VK_NAME + std::to_string(i);
|
||||
ctx->description = desc;
|
||||
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
|
||||
devices.push_back(new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_vk_device_i,
|
||||
/* .reg = */ reg,
|
||||
@@ -12640,6 +12738,20 @@ static bool ggml_vk_instance_debug_utils_ext_available(
|
||||
UNUSED(instance_extensions);
|
||||
}
|
||||
|
||||
static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) {
|
||||
VkPhysicalDeviceFeatures2 device_features2;
|
||||
device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
|
||||
|
||||
VkPhysicalDeviceVulkan11Features vk11_features;
|
||||
vk11_features.pNext = nullptr;
|
||||
vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
|
||||
device_features2.pNext = &vk11_features;
|
||||
|
||||
vkGetPhysicalDeviceFeatures2(vkdev, &device_features2);
|
||||
|
||||
return vk11_features.storageBuffer16BitAccess;
|
||||
}
|
||||
|
||||
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
|
||||
switch (props.vendorID) {
|
||||
case VK_VENDOR_ID_INTEL:
|
||||
@@ -13040,16 +13152,16 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
} else if (tensor->op == GGML_OP_IM2COL_3D) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
const int32_t s1 = tensor->op_params[2];
|
||||
const int32_t s2 = tensor->op_params[2];
|
||||
const int32_t p0 = tensor->op_params[3];
|
||||
const int32_t p1 = tensor->op_params[4];
|
||||
const int32_t p1 = tensor->op_params[5];
|
||||
const int32_t p2 = tensor->op_params[5];
|
||||
const int32_t d0 = tensor->op_params[6];
|
||||
const int32_t d1 = tensor->op_params[7];
|
||||
const int32_t d1 = tensor->op_params[8];
|
||||
const int32_t d2 = tensor->op_params[8];
|
||||
const int32_t IC = tensor->op_params[9];
|
||||
|
||||
tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
|
||||
tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
|
||||
} else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
|
||||
const int32_t dim = tensor->op_params[0];
|
||||
const int32_t max_period = tensor->op_params[1];
|
||||
|
||||
@@ -29,7 +29,7 @@ void main() {
|
||||
uint qs = data_a[ib].qs[4 * ib32 + l];
|
||||
const uint8_t sign = data_a[ib].qs[QUANT_K / 8 + 4 * ib32 + l];
|
||||
qs |= (qh << (8 - 2 * l)) & 0x300;
|
||||
const uvec2 grid = iq2s_grid[qs & 511];
|
||||
const uvec2 grid = iq2s_grid[qs];
|
||||
const u8vec4 grid0 = unpack8(grid.x);
|
||||
const u8vec4 grid1 = unpack8(grid.y);
|
||||
data_b[b_idx + 8 * l + 0] = D_TYPE(db[l/2] * grid0.x * ((sign & 1) != 0 ? -1.0 : 1.0));
|
||||
|
||||
@@ -33,7 +33,8 @@ void main() {
|
||||
[[unroll]] for (uint l = 0; l < 4; ++l) {
|
||||
const uint sign7 = bitfieldExtract(signscale, 7 * int(l), 7);
|
||||
const uint sign8 = sign7 | (bitCount(sign7) << 7); // parity bit
|
||||
const uvec2 grid = iq2xxs_grid[data_a[ib].qs[8 * is + l]];
|
||||
const uint qs = data_a[ib].qs[8 * is + l];
|
||||
const uvec2 grid = iq2xxs_grid[qs];
|
||||
const u8vec4 grid0 = unpack8(grid.x);
|
||||
const u8vec4 grid1 = unpack8(grid.y);
|
||||
data_b[b_idx + 8 * l + 0] = D_TYPE(db * grid0.x * ((sign8 & 1) != 0 ? -1.0 : 1.0));
|
||||
|
||||
@@ -22,15 +22,16 @@ void main() {
|
||||
const uint b_idx = 256 * ib + 32 * is;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const float db = d * (1 + 2 * ((data_a[ib].scales[is] >> (4 * (is % 2))) & 0xf));
|
||||
const float db = d * (1 + 2 * ((data_a[ib].scales[is / 2] >> (4 * (is % 2))) & 0xf));
|
||||
|
||||
// We must produce 32 values using 4 sign bytes, 1 qh byte, 8 qs bytes.
|
||||
uint qh = data_a[ib].qh[is];
|
||||
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
||||
uint qs = data_a[ib].qs[8 * is + l];
|
||||
uint gidx = qs | ((qh << (8 - l)) & 256);
|
||||
uint8_t signs = data_a[ib].signs[8 * is + l / 2] >> (4 * (l & 1));
|
||||
u8vec4 grid = unpack8(iq3s_grid[gidx]);
|
||||
const uint iqs = 8 * is + l;
|
||||
const uint qs = data_a[ib].qs[iqs];
|
||||
const uint gidx = qs | ((qh << (8 - l)) & 256);
|
||||
const uint8_t signs = data_a[ib].signs[iqs / 2] >> (4 * (l & 1));
|
||||
const u8vec4 grid = unpack8(iq3s_grid[gidx]);
|
||||
data_b[b_idx + 4 * l + 0] = D_TYPE(db * grid.x * ((signs & 1) != 0 ? -1.0 : 1.0));
|
||||
data_b[b_idx + 4 * l + 1] = D_TYPE(db * grid.y * ((signs & 2) != 0 ? -1.0 : 1.0));
|
||||
data_b[b_idx + 4 * l + 2] = D_TYPE(db * grid.z * ((signs & 4) != 0 ? -1.0 : 1.0));
|
||||
|
||||
@@ -35,8 +35,10 @@ void main() {
|
||||
const uint sign7 = bitfieldExtract(signscale, 7 * int(l), 7);
|
||||
// Restore parity bit.
|
||||
const uint sign8 = sign7 | (bitCount(sign7) << 7);
|
||||
const u8vec4 grid0 = unpack8(iq3xxs_grid[data_a[ib].qs[8 * is + 2 * l]]);
|
||||
const u8vec4 grid1 = unpack8(iq3xxs_grid[data_a[ib].qs[8 * is + 2 * l + 1]]);
|
||||
const uint qs0 = data_a[ib].qs[8 * is + 2 * l];
|
||||
const uint qs1 = data_a[ib].qs[8 * is + 2 * l + 1];
|
||||
const u8vec4 grid0 = unpack8(iq3xxs_grid[qs0]);
|
||||
const u8vec4 grid1 = unpack8(iq3xxs_grid[qs1]);
|
||||
data_b[b_idx + 8 * l + 0] = D_TYPE(db * grid0.x * ((sign8 & 1) != 0 ? -1.0 : 1.0));
|
||||
data_b[b_idx + 8 * l + 1] = D_TYPE(db * grid0.y * ((sign8 & 2) != 0 ? -1.0 : 1.0));
|
||||
data_b[b_idx + 8 * l + 2] = D_TYPE(db * grid0.z * ((sign8 & 4) != 0 ? -1.0 : 1.0));
|
||||
|
||||
@@ -183,6 +183,8 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
|
||||
shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
|
||||
#endif
|
||||
|
||||
#include "mul_mm_funcs.comp"
|
||||
|
||||
void main() {
|
||||
#ifdef NEEDS_INIT_IQ_SHMEM
|
||||
init_iq_shmem(gl_WorkGroupSize);
|
||||
@@ -310,550 +312,13 @@ void main() {
|
||||
|
||||
for (uint block = start_k; block < end_k; block += BK) {
|
||||
[[unroll]] for (uint l = 0; l < BM; l += loadstride_a) {
|
||||
|
||||
#if defined(DATA_A_F32) || defined(DATA_A_F16)
|
||||
#if LOAD_VEC_A == 8
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
A_TYPE32 aa = A_TYPE32(data_a[idx]);
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(aa[0].x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(aa[0].y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE(aa[0].z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE(aa[0].w);
|
||||
buf_a[buf_idx + 4] = FLOAT_TYPE(aa[1].x);
|
||||
buf_a[buf_idx + 5] = FLOAT_TYPE(aa[1].y);
|
||||
buf_a[buf_idx + 6] = FLOAT_TYPE(aa[1].z);
|
||||
buf_a[buf_idx + 7] = FLOAT_TYPE(aa[1].w);
|
||||
#elif LOAD_VEC_A == 4
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
A_TYPE32 aa = A_TYPE32(data_a[idx]);
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(aa.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(aa.y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE(aa.z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE(aa.w);
|
||||
#else
|
||||
if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) {
|
||||
buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]);
|
||||
} else {
|
||||
buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(0.0f);
|
||||
}
|
||||
#endif
|
||||
#elif defined(DATA_A_BF16)
|
||||
#if LOAD_VEC_A == 4
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
buf_a[buf_idx ] = TO_FLOAT_TYPE(data_a[idx].x);
|
||||
buf_a[buf_idx + 1] = TO_FLOAT_TYPE(data_a[idx].y);
|
||||
buf_a[buf_idx + 2] = TO_FLOAT_TYPE(data_a[idx].z);
|
||||
buf_a[buf_idx + 3] = TO_FLOAT_TYPE(data_a[idx].w);
|
||||
#else
|
||||
if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) {
|
||||
buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = TO_FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]);
|
||||
} else {
|
||||
buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = TO_FLOAT_TYPE(uint16_t(0));
|
||||
}
|
||||
#endif
|
||||
#elif defined(DATA_A_Q4_0)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 4 * loadr_a;
|
||||
|
||||
const uint ib = idx / 4;
|
||||
const uint iqs = idx & 0x03;
|
||||
|
||||
const float d = float(data_a_packed16[ib].d);
|
||||
const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
|
||||
const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d;
|
||||
const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v0.x);
|
||||
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
|
||||
buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z);
|
||||
buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x);
|
||||
buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y);
|
||||
buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z);
|
||||
buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w);
|
||||
#elif defined(DATA_A_Q4_1)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 4 * loadr_a;
|
||||
|
||||
const uint ib = idx / 4;
|
||||
const uint iqs = idx & 0x03;
|
||||
|
||||
const float d = float(data_a_packed16[ib].d);
|
||||
const float m = float(data_a_packed16[ib].m);
|
||||
const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
|
||||
const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * d + m;
|
||||
const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * d + m;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v0.x);
|
||||
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
|
||||
buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z);
|
||||
buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x);
|
||||
buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y);
|
||||
buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z);
|
||||
buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w);
|
||||
#elif defined(DATA_A_Q5_0)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = idx & 0x07;
|
||||
|
||||
const float d = float(data_a_packed16[ib].d);
|
||||
const uint uint_qh = uint(data_a_packed16[ib].qh[1]) << 16 | uint(data_a_packed16[ib].qh[0]);
|
||||
const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10);
|
||||
const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10);
|
||||
|
||||
const uint vui = uint(data_a_packed16[ib].qs[iqs]);
|
||||
const vec4 v = (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f) * d;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
||||
buf_a[buf_idx + 17] = FLOAT_TYPE(v.w);
|
||||
#elif defined(DATA_A_Q5_1)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = idx & 0x07;
|
||||
|
||||
const float d = float(data_a_packed16[ib].d);
|
||||
const float m = float(data_a_packed16[ib].m);
|
||||
const uint uint_qh = data_a_packed16[ib].qh;
|
||||
const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10);
|
||||
const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10);
|
||||
|
||||
const uint vui = uint(data_a_packed16[ib].qs[iqs]);
|
||||
const vec4 v = vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) * d + m;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
||||
buf_a[buf_idx + 17] = FLOAT_TYPE(v.w);
|
||||
#elif defined(DATA_A_Q8_0)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = idx & 0x07;
|
||||
|
||||
const float d = float(data_a_packed16[ib].d);
|
||||
const i8vec2 v0 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs])).xy; // vec4 used due to #12147
|
||||
const i8vec2 v1 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs + 1])).xy;
|
||||
const vec4 v = vec4(v0.x, v0.y, v1.x, v1.y) * d;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE(v.z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE(v.w);
|
||||
#elif defined(DATA_A_Q2_K)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30
|
||||
const uint scalesi = iqs / 8; // 0..15
|
||||
const uint qsshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
|
||||
|
||||
const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
|
||||
const uint scales = data_a[ib].scales[scalesi];
|
||||
const vec2 d = vec2(data_a[ib].d);
|
||||
|
||||
const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
||||
#elif defined(DATA_A_Q3_K)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 64; // 0,1
|
||||
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..62
|
||||
const uint hmi = (iqs % 16) * 2; // 0,2,4..30
|
||||
const uint j = (iqs % 64) / 4; // 0..3
|
||||
const uint is = iqs / 8; // 0..15
|
||||
const uint halfsplit = ((iqs % 64) / 16); // 0,1,2,3
|
||||
const uint qsshift = halfsplit * 2; // 0,2,4,6
|
||||
const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128
|
||||
|
||||
const int8_t us = int8_t(((data_a[ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF)
|
||||
| (((data_a[ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4));
|
||||
const float dl = float(data_a[ib].d) * float(us - 32);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 0 : 4)));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));
|
||||
#elif defined(DATA_A_Q4_K)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 32; // 0,1,2,3
|
||||
const uint b = (iqs % 32) / 16; // 0,1
|
||||
const uint is = 2 * n + b; // 0..7
|
||||
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126
|
||||
|
||||
const vec2 loadd = vec2(data_a[ib].d);
|
||||
|
||||
const uint scidx0 = (is < 4) ? is : (is + 4);
|
||||
const uint scidx1 = (is < 4) ? is : (is - 4);
|
||||
const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
const uint scidxshift1 = (is < 4) ? 0 : 2;
|
||||
const uint mbidx0 = is + 4;
|
||||
const uint mbidx1 = (is < 4) ? is + 4 : is;
|
||||
const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
const uint mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
const uint mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||
const uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||
|
||||
const float d = loadd.x * sc;
|
||||
const float m = -loadd.y * mbyte;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m));
|
||||
#elif defined(DATA_A_Q5_K)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 32; // 0,1,2,3
|
||||
const uint b = (iqs % 32) / 16; // 0,1
|
||||
const uint is = 2 * n + b; // 0..7
|
||||
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126
|
||||
const uint qhi = (iqs % 16) * 2; // 0,2,4..30
|
||||
|
||||
const uint8_t hm = uint8_t(1 << (iqs / 16));
|
||||
|
||||
const vec2 loadd = vec2(data_a[ib].d);
|
||||
|
||||
const uint scidx0 = (is < 4) ? is : (is + 4);
|
||||
const uint scidx1 = (is < 4) ? is : (is - 4);
|
||||
const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
const uint scidxshift1 = (is < 4) ? 0 : 2;
|
||||
const uint mbidx0 = is + 4;
|
||||
const uint mbidx1 = (is < 4) ? is + 4 : is;
|
||||
const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
const uint mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
const uint mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||
const uint8_t mbyte = uint8_t(((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||
|
||||
const float d = loadd.x * sc;
|
||||
const float m = -loadd.y * mbyte;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m));
|
||||
#elif defined(DATA_A_Q6_K)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 64; // 0,1
|
||||
const uint b = (iqs % 64) / 32; // 0,1
|
||||
const uint is_b = (iqs % 16) / 8; // 0,1
|
||||
const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
|
||||
const uint is = 8 * n + qhshift + is_b; // 0..15
|
||||
const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126
|
||||
const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62
|
||||
|
||||
const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
|
||||
#elif defined(DATA_A_IQ1_S)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 32; // 8 values per idx
|
||||
const uint ib32 = (idx % 32) / 4; // 0..7
|
||||
const uint ib8 = idx % 32;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint qh = data_a[ib].qh[ib32];
|
||||
const uint qs = data_a[ib].qs[ib8];
|
||||
const float dl = d * (2 * bitfieldExtract(qh, 12, 3) + 1);
|
||||
const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
|
||||
const int16_t grid = int16_t(iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)]);
|
||||
|
||||
[[unroll]] for (int k = 0; k < 8; ++k) {
|
||||
buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta));
|
||||
}
|
||||
#elif defined(DATA_A_IQ1_M)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 32; // 8 values per idx
|
||||
const uint ib8 = idx % 32;
|
||||
const uint ib16 = ib8 / 2;
|
||||
|
||||
const uint16_t[4] scales = data_a[ib].scales;
|
||||
const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12;
|
||||
const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x);
|
||||
const uint sc = scales[ib8 / 8];
|
||||
const uint qs = data_a[ib].qs[ib8];
|
||||
const uint qh = data_a[ib].qh[ib16] >> (4 * (ib8 & 1));
|
||||
const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1);
|
||||
const float delta = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA;
|
||||
const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]);
|
||||
|
||||
[[unroll]] for (int k = 0; k < 8; ++k) {
|
||||
buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta));
|
||||
}
|
||||
#elif defined(DATA_A_IQ2_XXS)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 32; // 8 values per idx
|
||||
const uint ib32 = (idx % 32) / 4; // 0..7
|
||||
const uint ib8 = idx % 4;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint qs = data_a[ib].qs[8 * ib32 + ib8];
|
||||
const uint signs = pack32(u8vec4(
|
||||
data_a[ib].qs[8*ib32 + 4],
|
||||
data_a[ib].qs[8*ib32 + 5],
|
||||
data_a[ib].qs[8*ib32 + 6],
|
||||
data_a[ib].qs[8*ib32 + 7]
|
||||
));
|
||||
const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + (signs >> 28)));
|
||||
const uint32_t sign7 = bitfieldExtract(signs, 7 * int(ib8), 7);
|
||||
const uint sign = sign7 | (bitCount(sign7) << 7);
|
||||
const uvec2 grid = iq2xxs_grid[qs];
|
||||
const vec4 grid0 = vec4(unpack8(grid.x));
|
||||
const vec4 grid1 = vec4(unpack8(grid.y));
|
||||
|
||||
buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x);
|
||||
buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y);
|
||||
buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z);
|
||||
buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w);
|
||||
buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x);
|
||||
buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y);
|
||||
buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z);
|
||||
buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w);
|
||||
#elif defined(DATA_A_IQ2_XS)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 32; // 8 values per idx
|
||||
const uint ib32 = (idx % 32) / 4; // 0..7
|
||||
const uint ib8 = idx % 4; // 0..3
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
|
||||
const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale));
|
||||
const uint qs = data_a[ib].qs[4 * ib32 + ib8];
|
||||
const uint sign7 = qs >> 9;
|
||||
const uint sign = sign7 | (bitCount(sign7) << 7);
|
||||
const uvec2 grid = iq2xs_grid[qs & 511];
|
||||
const vec4 grid0 = vec4(unpack8(grid.x));
|
||||
const vec4 grid1 = vec4(unpack8(grid.y));
|
||||
|
||||
buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x);
|
||||
buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y);
|
||||
buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z);
|
||||
buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w);
|
||||
buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x);
|
||||
buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y);
|
||||
buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z);
|
||||
buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w);
|
||||
#elif defined(DATA_A_IQ2_S)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 32; // 8 values per idx
|
||||
const uint ib8 = idx % 32; // 0..31
|
||||
const uint ib32 = ib8 / 4; // 0..7
|
||||
|
||||
const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
|
||||
const uint qs = data_a[ib].qs[ib8];
|
||||
const uint qh = data_a[ib].qh[ib32];
|
||||
const uint qhshift = 2 * (ib8 % 4);
|
||||
const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8];
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale));
|
||||
const uvec2 grid = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)];
|
||||
const vec4 grid0 = vec4(unpack8(grid.x));
|
||||
const vec4 grid1 = vec4(unpack8(grid.y));
|
||||
|
||||
buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x);
|
||||
buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y);
|
||||
buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z);
|
||||
buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w);
|
||||
buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x);
|
||||
buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y);
|
||||
buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z);
|
||||
buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w);
|
||||
#elif defined(DATA_A_IQ3_XXS)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 64; // 4 values per idx
|
||||
const uint iqs = idx % 64; // 0..63
|
||||
const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint qs = data_a[ib].qs[iqs];
|
||||
const uint signs = pack32(u8vec4(
|
||||
data_a[ib].qs[is+0],
|
||||
data_a[ib].qs[is+1],
|
||||
data_a[ib].qs[is+2],
|
||||
data_a[ib].qs[is+3]
|
||||
));
|
||||
const float db = d * 0.5 * (0.5 + (signs >> 28));
|
||||
const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
|
||||
const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (4 * (idx % 2));
|
||||
const uint grid = iq3xxs_grid[qs];
|
||||
const vec4 v = db * vec4(unpack8(grid));
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE((sign & 1) != 0 ? -v.x : v.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE((sign & 2) != 0 ? -v.y : v.y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE((sign & 4) != 0 ? -v.z : v.z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE((sign & 8) != 0 ? -v.w : v.w);
|
||||
#elif defined(DATA_A_IQ3_S)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 64; // 4 values per idx
|
||||
const uint iqs = idx % 64; // 0..63
|
||||
const uint iqh = iqs / 8;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint qs = data_a[ib].qs[iqs];
|
||||
const uint qh = data_a[ib].qh[iqh];
|
||||
const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (4 * (idx % 2)));
|
||||
const uint scale = data_a[ib].scales[iqs / 16];
|
||||
const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign)));
|
||||
const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf));
|
||||
const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)];
|
||||
const vec4 v = db * vec4(unpack8(grid));
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE((sign & 1) != 0 ? -v.x : v.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE((sign & 2) != 0 ? -v.y : v.y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE((sign & 4) != 0 ? -v.z : v.z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE((sign & 8) != 0 ? -v.w : v.w);
|
||||
#elif defined(DATA_A_IQ4_XS)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint ib32 = (idx % 128) / 16; // 0..7
|
||||
const uint iq = 16 * ib32 + 2 * (idx % 8);
|
||||
|
||||
const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
|
||||
const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
|
||||
const uint qshift = (idx & 8) >> 1;
|
||||
u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]);
|
||||
qs = (qs >> qshift) & uint8_t(0xF);
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
||||
#elif defined(DATA_A_IQ4_NL)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = idx & 0x07;
|
||||
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib].d);
|
||||
const uint vui = uint(data_a_packed16[ib].qs[iqs]);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(kvalues_iq4nl[vui & 0xF]) * d;
|
||||
buf_a[buf_idx + 1 ] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 8, 4)]) * d;
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 4, 4)]) * d;
|
||||
buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_iq4nl[vui >> 12]) * d;
|
||||
#elif defined(DATA_A_MXFP4)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = (idx & 0x07) * 2;
|
||||
|
||||
const float d = e8m0_to_fp32(data_a[ib].e);
|
||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
||||
const uint vui2 = uint(data_a[ib].qs[iqs+1]);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(kvalues_mxfp4[vui & 0xF] * d);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_mxfp4[vui >> 4] * d);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(kvalues_mxfp4[vui2 & 0xF] * d);
|
||||
buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_mxfp4[vui2 >> 4] * d);
|
||||
#endif
|
||||
load_a_to_shmem(pos_a, loadr_a, loadc_a + l, ir * BM + loadc_a + l, block + loadr_a, end_k);
|
||||
}
|
||||
[[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
|
||||
#if LOAD_VEC_B == 8
|
||||
#ifdef MUL_MAT_ID
|
||||
const u16vec2 row_idx = row_ids[loadc_b + l];
|
||||
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b;
|
||||
#if !defined(MUL_MAT_ID)
|
||||
load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic * BN + loadc_b + l, block + loadr_b, end_k);
|
||||
#else
|
||||
const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
|
||||
#endif
|
||||
const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
|
||||
#if defined(DATA_B_BF16)
|
||||
B_TYPE32 bb = TO_FLOAT_TYPE(data_b[idx]);
|
||||
#else
|
||||
B_TYPE32 bb = B_TYPE32(data_b[idx]);
|
||||
#endif
|
||||
buf_b[buf_idx + 0] = FLOAT_TYPE(bb[0].x);
|
||||
buf_b[buf_idx + 1] = FLOAT_TYPE(bb[0].y);
|
||||
buf_b[buf_idx + 2] = FLOAT_TYPE(bb[0].z);
|
||||
buf_b[buf_idx + 3] = FLOAT_TYPE(bb[0].w);
|
||||
buf_b[buf_idx + 4] = FLOAT_TYPE(bb[1].x);
|
||||
buf_b[buf_idx + 5] = FLOAT_TYPE(bb[1].y);
|
||||
buf_b[buf_idx + 6] = FLOAT_TYPE(bb[1].z);
|
||||
buf_b[buf_idx + 7] = FLOAT_TYPE(bb[1].w);
|
||||
#elif LOAD_VEC_B == 4
|
||||
#ifdef MUL_MAT_ID
|
||||
const u16vec2 row_idx = row_ids[loadc_b + l];
|
||||
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b;
|
||||
#else
|
||||
const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
|
||||
#endif
|
||||
const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
|
||||
#if defined(DATA_B_BF16)
|
||||
B_TYPE32 bb = TO_FLOAT_TYPE(data_b[idx]);
|
||||
#else
|
||||
B_TYPE32 bb = B_TYPE32(data_b[idx]);
|
||||
#endif
|
||||
buf_b[buf_idx + 0] = FLOAT_TYPE(bb.x);
|
||||
buf_b[buf_idx + 1] = FLOAT_TYPE(bb.y);
|
||||
buf_b[buf_idx + 2] = FLOAT_TYPE(bb.z);
|
||||
buf_b[buf_idx + 3] = FLOAT_TYPE(bb.w);
|
||||
#elif !MUL_MAT_ID
|
||||
if (ic * BN + loadc_b + l < p.N && block + loadr_b < end_k) {
|
||||
buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]);
|
||||
} else {
|
||||
buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f);
|
||||
}
|
||||
#else
|
||||
const uint row_i = ic * BN + loadc_b + l;
|
||||
if (row_i < _ne1 && block + loadr_b < end_k) {
|
||||
const u16vec2 row_idx = row_ids[loadc_b + l];
|
||||
buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]);
|
||||
} else {
|
||||
buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f);
|
||||
}
|
||||
load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic, _ne1, block + loadr_b, end_k);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,568 @@
|
||||
void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uint idx_m, const uint idx_k, const uint end_k) {
|
||||
#if defined(DATA_A_F32) || defined(DATA_A_F16)
|
||||
#if LOAD_VEC_A == 8
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
FLOAT_TYPE_VEC8 aa = FLOAT_TYPE_VEC8(data_a[idx]);
|
||||
buf_a[buf_idx ] = aa[0].x;
|
||||
buf_a[buf_idx + 1] = aa[0].y;
|
||||
buf_a[buf_idx + 2] = aa[0].z;
|
||||
buf_a[buf_idx + 3] = aa[0].w;
|
||||
buf_a[buf_idx + 4] = aa[1].x;
|
||||
buf_a[buf_idx + 5] = aa[1].y;
|
||||
buf_a[buf_idx + 6] = aa[1].z;
|
||||
buf_a[buf_idx + 7] = aa[1].w;
|
||||
#elif LOAD_VEC_A == 4
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
FLOAT_TYPE_VEC4 aa = FLOAT_TYPE_VEC4(data_a[idx]);
|
||||
buf_a[buf_idx ] = aa.x;
|
||||
buf_a[buf_idx + 1] = aa.y;
|
||||
buf_a[buf_idx + 2] = aa.z;
|
||||
buf_a[buf_idx + 3] = aa.w;
|
||||
#else
|
||||
if (idx_m < p.M && idx_k < end_k) {
|
||||
buf_a[col * SHMEM_STRIDE + row] = FLOAT_TYPE(data_a[pos_a + col * p.stride_a + row]);
|
||||
} else {
|
||||
buf_a[col * SHMEM_STRIDE + row] = FLOAT_TYPE(0.0f);
|
||||
}
|
||||
#endif
|
||||
#elif defined(DATA_A_BF16)
|
||||
#if LOAD_VEC_A == 4
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
FLOAT_TYPE_VEC4 aa = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_a[idx]));
|
||||
buf_a[buf_idx ] = aa.x;
|
||||
buf_a[buf_idx + 1] = aa.y;
|
||||
buf_a[buf_idx + 2] = aa.z;
|
||||
buf_a[buf_idx + 3] = aa.w;
|
||||
#else
|
||||
if (idx_m < p.M && idx_k < end_k) {
|
||||
buf_a[col * SHMEM_STRIDE + row] = TO_FLOAT_TYPE(data_a[pos_a + col * p.stride_a + row]);
|
||||
} else {
|
||||
buf_a[col * SHMEM_STRIDE + row] = TO_FLOAT_TYPE(uint16_t(0));
|
||||
}
|
||||
#endif
|
||||
#elif defined(DATA_A_Q4_0)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + 4 * row;
|
||||
|
||||
const uint ib = idx / 4;
|
||||
const uint iqs = idx & 0x03;
|
||||
|
||||
const float d = float(data_a_packed16[ib].d);
|
||||
const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
|
||||
const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d;
|
||||
const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v0.x);
|
||||
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
|
||||
buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z);
|
||||
buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x);
|
||||
buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y);
|
||||
buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z);
|
||||
buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w);
|
||||
#elif defined(DATA_A_Q4_1)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + 4 * row;
|
||||
|
||||
const uint ib = idx / 4;
|
||||
const uint iqs = idx & 0x03;
|
||||
|
||||
const float d = float(data_a_packed16[ib].d);
|
||||
const float m = float(data_a_packed16[ib].m);
|
||||
const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
|
||||
const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * d + m;
|
||||
const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * d + m;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v0.x);
|
||||
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
|
||||
buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z);
|
||||
buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x);
|
||||
buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y);
|
||||
buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z);
|
||||
buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w);
|
||||
#elif defined(DATA_A_Q5_0)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + 2 * row;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = idx & 0x07;
|
||||
|
||||
const float d = float(data_a_packed16[ib].d);
|
||||
const uint uint_qh = uint(data_a_packed16[ib].qh[1]) << 16 | uint(data_a_packed16[ib].qh[0]);
|
||||
const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10);
|
||||
const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10);
|
||||
|
||||
const uint vui = uint(data_a_packed16[ib].qs[iqs]);
|
||||
const vec4 v = (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f) * d;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
||||
buf_a[buf_idx + 17] = FLOAT_TYPE(v.w);
|
||||
#elif defined(DATA_A_Q5_1)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + 2 * row;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = idx & 0x07;
|
||||
|
||||
const float d = float(data_a_packed16[ib].d);
|
||||
const float m = float(data_a_packed16[ib].m);
|
||||
const uint uint_qh = data_a_packed16[ib].qh;
|
||||
const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10);
|
||||
const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10);
|
||||
|
||||
const uint vui = uint(data_a_packed16[ib].qs[iqs]);
|
||||
const vec4 v = vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) * d + m;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
||||
buf_a[buf_idx + 17] = FLOAT_TYPE(v.w);
|
||||
#elif defined(DATA_A_Q8_0)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = idx & 0x07;
|
||||
|
||||
const float d = float(data_a_packed16[ib].d);
|
||||
const i8vec2 v0 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs])).xy; // vec4 used due to #12147
|
||||
const i8vec2 v1 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs + 1])).xy;
|
||||
const vec4 v = vec4(v0.x, v0.y, v1.x, v1.y) * d;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE(v.z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE(v.w);
|
||||
#elif defined(DATA_A_Q2_K)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30
|
||||
const uint scalesi = iqs / 8; // 0..15
|
||||
const uint qsshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
|
||||
|
||||
const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
|
||||
const uint scales = data_a[ib].scales[scalesi];
|
||||
const vec2 d = vec2(data_a[ib].d);
|
||||
|
||||
const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
||||
#elif defined(DATA_A_Q3_K)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 64; // 0,1
|
||||
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..62
|
||||
const uint hmi = (iqs % 16) * 2; // 0,2,4..30
|
||||
const uint j = (iqs % 64) / 4; // 0..3
|
||||
const uint is = iqs / 8; // 0..15
|
||||
const uint halfsplit = ((iqs % 64) / 16); // 0,1,2,3
|
||||
const uint qsshift = halfsplit * 2; // 0,2,4,6
|
||||
const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128
|
||||
|
||||
const int8_t us = int8_t(((data_a[ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF)
|
||||
| (((data_a[ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4));
|
||||
const float dl = float(data_a[ib].d) * float(us - 32);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 0 : 4)));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));
|
||||
#elif defined(DATA_A_Q4_K)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 32; // 0,1,2,3
|
||||
const uint b = (iqs % 32) / 16; // 0,1
|
||||
const uint is = 2 * n + b; // 0..7
|
||||
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126
|
||||
|
||||
const vec2 loadd = vec2(data_a[ib].d);
|
||||
|
||||
const uint scidx0 = (is < 4) ? is : (is + 4);
|
||||
const uint scidx1 = (is < 4) ? is : (is - 4);
|
||||
const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
const uint scidxshift1 = (is < 4) ? 0 : 2;
|
||||
const uint mbidx0 = is + 4;
|
||||
const uint mbidx1 = (is < 4) ? is + 4 : is;
|
||||
const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
const uint mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
const uint mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||
const uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||
|
||||
const float d = loadd.x * sc;
|
||||
const float m = -loadd.y * mbyte;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m));
|
||||
#elif defined(DATA_A_Q5_K)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 32; // 0,1,2,3
|
||||
const uint b = (iqs % 32) / 16; // 0,1
|
||||
const uint is = 2 * n + b; // 0..7
|
||||
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126
|
||||
const uint qhi = (iqs % 16) * 2; // 0,2,4..30
|
||||
|
||||
const uint8_t hm = uint8_t(1 << (iqs / 16));
|
||||
|
||||
const vec2 loadd = vec2(data_a[ib].d);
|
||||
|
||||
const uint scidx0 = (is < 4) ? is : (is + 4);
|
||||
const uint scidx1 = (is < 4) ? is : (is - 4);
|
||||
const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
const uint scidxshift1 = (is < 4) ? 0 : 2;
|
||||
const uint mbidx0 = is + 4;
|
||||
const uint mbidx1 = (is < 4) ? is + 4 : is;
|
||||
const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
const uint mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
const uint mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||
const uint8_t mbyte = uint8_t(((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||
|
||||
const float d = loadd.x * sc;
|
||||
const float m = -loadd.y * mbyte;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m));
|
||||
#elif defined(DATA_A_Q6_K)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 64; // 0,1
|
||||
const uint b = (iqs % 64) / 32; // 0,1
|
||||
const uint is_b = (iqs % 16) / 8; // 0,1
|
||||
const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
|
||||
const uint is = 8 * n + qhshift + is_b; // 0..15
|
||||
const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126
|
||||
const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62
|
||||
|
||||
const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
|
||||
#elif defined(DATA_A_IQ1_S)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 32; // 8 values per idx
|
||||
const uint ib32 = (idx % 32) / 4; // 0..7
|
||||
const uint ib8 = idx % 32;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint qh = data_a[ib].qh[ib32];
|
||||
const uint qs = data_a[ib].qs[ib8];
|
||||
const float dl = d * (2 * bitfieldExtract(qh, 12, 3) + 1);
|
||||
const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
|
||||
const int16_t grid = int16_t(iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)]);
|
||||
|
||||
[[unroll]] for (int k = 0; k < 8; ++k) {
|
||||
buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta));
|
||||
}
|
||||
#elif defined(DATA_A_IQ1_M)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 32; // 8 values per idx
|
||||
const uint ib8 = idx % 32;
|
||||
const uint ib16 = ib8 / 2;
|
||||
|
||||
const uint16_t[4] scales = data_a[ib].scales;
|
||||
const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12;
|
||||
const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x);
|
||||
const uint sc = scales[ib8 / 8];
|
||||
const uint qs = data_a[ib].qs[ib8];
|
||||
const uint qh = data_a[ib].qh[ib16] >> (4 * (ib8 & 1));
|
||||
const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1);
|
||||
const float delta = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA;
|
||||
const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]);
|
||||
|
||||
[[unroll]] for (int k = 0; k < 8; ++k) {
|
||||
buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta));
|
||||
}
|
||||
#elif defined(DATA_A_IQ2_XXS)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 32; // 8 values per idx
|
||||
const uint ib32 = (idx % 32) / 4; // 0..7
|
||||
const uint ib8 = idx % 4;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint qs = data_a[ib].qs[8 * ib32 + ib8];
|
||||
const uint signs = pack32(u8vec4(
|
||||
data_a[ib].qs[8*ib32 + 4],
|
||||
data_a[ib].qs[8*ib32 + 5],
|
||||
data_a[ib].qs[8*ib32 + 6],
|
||||
data_a[ib].qs[8*ib32 + 7]
|
||||
));
|
||||
const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + (signs >> 28)));
|
||||
const uint32_t sign7 = bitfieldExtract(signs, 7 * int(ib8), 7);
|
||||
const uint sign = sign7 | (bitCount(sign7) << 7);
|
||||
const uvec2 grid = iq2xxs_grid[qs];
|
||||
const vec4 grid0 = vec4(unpack8(grid.x));
|
||||
const vec4 grid1 = vec4(unpack8(grid.y));
|
||||
|
||||
buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x);
|
||||
buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y);
|
||||
buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z);
|
||||
buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w);
|
||||
buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x);
|
||||
buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y);
|
||||
buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z);
|
||||
buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w);
|
||||
#elif defined(DATA_A_IQ2_XS)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 32; // 8 values per idx
|
||||
const uint ib32 = (idx % 32) / 4; // 0..7
|
||||
const uint ib8 = idx % 4; // 0..3
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
|
||||
const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale));
|
||||
const uint qs = data_a[ib].qs[4 * ib32 + ib8];
|
||||
const uint sign7 = qs >> 9;
|
||||
const uint sign = sign7 | (bitCount(sign7) << 7);
|
||||
const uvec2 grid = iq2xs_grid[qs & 511];
|
||||
const vec4 grid0 = vec4(unpack8(grid.x));
|
||||
const vec4 grid1 = vec4(unpack8(grid.y));
|
||||
|
||||
buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x);
|
||||
buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y);
|
||||
buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z);
|
||||
buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w);
|
||||
buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x);
|
||||
buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y);
|
||||
buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z);
|
||||
buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w);
|
||||
#elif defined(DATA_A_IQ2_S)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 32; // 8 values per idx
|
||||
const uint ib8 = idx % 32; // 0..31
|
||||
const uint ib32 = ib8 / 4; // 0..7
|
||||
|
||||
const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
|
||||
const uint qs = data_a[ib].qs[ib8];
|
||||
const uint qh = data_a[ib].qh[ib32];
|
||||
const uint qhshift = 2 * (ib8 % 4);
|
||||
const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8];
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale));
|
||||
const uvec2 grid = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)];
|
||||
const vec4 grid0 = vec4(unpack8(grid.x));
|
||||
const vec4 grid1 = vec4(unpack8(grid.y));
|
||||
|
||||
buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x);
|
||||
buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y);
|
||||
buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z);
|
||||
buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w);
|
||||
buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x);
|
||||
buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y);
|
||||
buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z);
|
||||
buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w);
|
||||
#elif defined(DATA_A_IQ3_XXS)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 64; // 4 values per idx
|
||||
const uint iqs = idx % 64; // 0..63
|
||||
const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint qs = data_a[ib].qs[iqs];
|
||||
const uint signs = pack32(u8vec4(
|
||||
data_a[ib].qs[is+0],
|
||||
data_a[ib].qs[is+1],
|
||||
data_a[ib].qs[is+2],
|
||||
data_a[ib].qs[is+3]
|
||||
));
|
||||
const float db = d * 0.5 * (0.5 + (signs >> 28));
|
||||
const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
|
||||
const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (4 * (idx % 2));
|
||||
const uint grid = iq3xxs_grid[qs];
|
||||
const vec4 v = db * vec4(unpack8(grid));
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE((sign & 1) != 0 ? -v.x : v.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE((sign & 2) != 0 ? -v.y : v.y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE((sign & 4) != 0 ? -v.z : v.z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE((sign & 8) != 0 ? -v.w : v.w);
|
||||
#elif defined(DATA_A_IQ3_S)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 64; // 4 values per idx
|
||||
const uint iqs = idx % 64; // 0..63
|
||||
const uint iqh = iqs / 8;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint qs = data_a[ib].qs[iqs];
|
||||
const uint qh = data_a[ib].qh[iqh];
|
||||
const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (4 * (idx % 2)));
|
||||
const uint scale = data_a[ib].scales[iqs / 16];
|
||||
const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign)));
|
||||
const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf));
|
||||
const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)];
|
||||
const vec4 v = db * vec4(unpack8(grid));
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE((sign & 1) != 0 ? -v.x : v.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE((sign & 2) != 0 ? -v.y : v.y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE((sign & 4) != 0 ? -v.z : v.z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE((sign & 8) != 0 ? -v.w : v.w);
|
||||
#elif defined(DATA_A_IQ4_XS)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint ib32 = (idx % 128) / 16; // 0..7
|
||||
const uint iq = 16 * ib32 + 2 * (idx % 8);
|
||||
|
||||
const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
|
||||
const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
|
||||
const uint qshift = (idx & 8) >> 1;
|
||||
u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]);
|
||||
qs = (qs >> qshift) & uint8_t(0xF);
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
||||
#elif defined(DATA_A_IQ4_NL)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + 2 * row;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = idx & 0x07;
|
||||
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib].d);
|
||||
const uint vui = uint(data_a_packed16[ib].qs[iqs]);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(kvalues_iq4nl[vui & 0xF]) * d;
|
||||
buf_a[buf_idx + 1 ] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 8, 4)]) * d;
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 4, 4)]) * d;
|
||||
buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_iq4nl[vui >> 12]) * d;
|
||||
#elif defined(DATA_A_MXFP4)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + 2 * row;
|
||||
|
||||
const uint ib = idx / 8;
|
||||
const uint iqs = (idx & 0x07) * 2;
|
||||
|
||||
const float d = e8m0_to_fp32(data_a[ib].e);
|
||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
||||
const uint vui2 = uint(data_a[ib].qs[iqs+1]);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(kvalues_mxfp4[vui & 0xF] * d);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_mxfp4[vui >> 4] * d);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(kvalues_mxfp4[vui2 & 0xF] * d);
|
||||
buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_mxfp4[vui2 >> 4] * d);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if !defined(MUL_MAT_ID)
|
||||
void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint idx_n, const uint idx_k, const uint end_k) {
|
||||
#if LOAD_VEC_B == 8
|
||||
// Not supported for b_type bf16 because bf16mat2x4 does not exist
|
||||
const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B;
|
||||
FLOAT_TYPE_VEC8 bb = FLOAT_TYPE_VEC8(data_b[idx]);
|
||||
buf_b[buf_idx + 0] = bb[0].x;
|
||||
buf_b[buf_idx + 1] = bb[0].y;
|
||||
buf_b[buf_idx + 2] = bb[0].z;
|
||||
buf_b[buf_idx + 3] = bb[0].w;
|
||||
buf_b[buf_idx + 4] = bb[1].x;
|
||||
buf_b[buf_idx + 5] = bb[1].y;
|
||||
buf_b[buf_idx + 6] = bb[1].z;
|
||||
buf_b[buf_idx + 7] = bb[1].w;
|
||||
#elif LOAD_VEC_B == 4
|
||||
const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B;
|
||||
#if defined(DATA_B_BF16)
|
||||
FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_b[idx]));
|
||||
#else
|
||||
FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(data_b[idx]);
|
||||
#endif
|
||||
buf_b[buf_idx + 0] = bb.x;
|
||||
buf_b[buf_idx + 1] = bb.y;
|
||||
buf_b[buf_idx + 2] = bb.z;
|
||||
buf_b[buf_idx + 3] = bb.w;
|
||||
#else // LOAD_VEC_B == 1
|
||||
if (idx_n < p.N && idx_k < end_k) {
|
||||
buf_b[col * SHMEM_STRIDE + row] = TO_FLOAT_TYPE(data_b[pos_b + col * p.stride_b + row]);
|
||||
} else {
|
||||
buf_b[col * SHMEM_STRIDE + row] = FLOAT_TYPE(0.0f);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint ic, const uint _ne1, const uint idx_k, const uint end_k) {
|
||||
#if LOAD_VEC_B == 8
|
||||
// Not supported for b_type bf16 because bf16mat2x4 does not exist
|
||||
const u16vec2 row_idx = row_ids[col];
|
||||
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B;
|
||||
FLOAT_TYPE_VEC8 bb = FLOAT_TYPE_VEC8(data_b[idx]);
|
||||
buf_b[buf_idx + 0] = bb[0].x;
|
||||
buf_b[buf_idx + 1] = bb[0].y;
|
||||
buf_b[buf_idx + 2] = bb[0].z;
|
||||
buf_b[buf_idx + 3] = bb[0].w;
|
||||
buf_b[buf_idx + 4] = bb[1].x;
|
||||
buf_b[buf_idx + 5] = bb[1].y;
|
||||
buf_b[buf_idx + 6] = bb[1].z;
|
||||
buf_b[buf_idx + 7] = bb[1].w;
|
||||
#elif LOAD_VEC_B == 4
|
||||
const u16vec2 row_idx = row_ids[col];
|
||||
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B;
|
||||
#if defined(DATA_B_BF16)
|
||||
FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_b[idx]));
|
||||
#else
|
||||
FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(data_b[idx]);
|
||||
#endif
|
||||
buf_b[buf_idx + 0] = bb.x;
|
||||
buf_b[buf_idx + 1] = bb.y;
|
||||
buf_b[buf_idx + 2] = bb.z;
|
||||
buf_b[buf_idx + 3] = bb.w;
|
||||
#else // LOAD_VEC_B == 1
|
||||
const uint row_i = ic * BN + col;
|
||||
if (row_i < _ne1 && idx_k < end_k) {
|
||||
const u16vec2 row_idx = row_ids[col];
|
||||
buf_b[col * SHMEM_STRIDE + row] = TO_FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row]);
|
||||
} else {
|
||||
buf_b[col * SHMEM_STRIDE + row] = FLOAT_TYPE(0.0f);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
@@ -24,11 +24,12 @@ void main() {
|
||||
const uint j = gl_GlobalInvocationID.x;
|
||||
const uint d_offset = i * p.nb1;
|
||||
|
||||
if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) {
|
||||
data_d[d_offset + p.dim] = 0.f;
|
||||
const uint half_dim = p.dim / 2;
|
||||
|
||||
if (p.dim % 2 != 0 && j == half_dim) {
|
||||
data_d[d_offset + 2 * half_dim] = 0.f;
|
||||
}
|
||||
|
||||
const uint half_dim = p.dim / 2;
|
||||
if (j >= half_dim) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -13,13 +13,10 @@
|
||||
|
||||
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
||||
#define A_TYPE float
|
||||
#define A_TYPE32 float
|
||||
#elif LOAD_VEC_A == 4
|
||||
#define A_TYPE vec4
|
||||
#define A_TYPE32 vec4
|
||||
#elif LOAD_VEC_A == 8
|
||||
#define A_TYPE mat2x4
|
||||
#define A_TYPE32 mat2x4
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -29,13 +26,10 @@
|
||||
|
||||
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
||||
#define A_TYPE float16_t
|
||||
#define A_TYPE32 float
|
||||
#elif LOAD_VEC_A == 4
|
||||
#define A_TYPE f16vec4
|
||||
#define A_TYPE32 vec4
|
||||
#elif LOAD_VEC_A == 8
|
||||
#define A_TYPE f16mat2x4
|
||||
#define A_TYPE32 mat2x4
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
@@ -320,9 +320,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4";
|
||||
std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4";
|
||||
|
||||
std::map<std::string, std::string> base_dict = {
|
||||
{"FLOAT_TYPE_VEC2", (coopmat2 || fp16) ? "f16vec2" : "vec2"},
|
||||
};
|
||||
std::map<std::string, std::string> base_dict;
|
||||
std::string shader_name = "matmul";
|
||||
|
||||
if (matmul_id_type == MatMulIdType::DEFAULT) {
|
||||
@@ -349,26 +347,74 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
|
||||
const std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp";
|
||||
|
||||
auto const &FLOAT_TYPE = [&](const std::string &t) -> std::string {
|
||||
if (t == "bf16") {
|
||||
// scalar path promotes to float
|
||||
if (!coopmat && !coopmat2) {
|
||||
return "float";
|
||||
auto const &FLOAT_TYPE = [&](int vec, const std::string &t) -> std::string {
|
||||
switch (vec) {
|
||||
case 1:
|
||||
if (t == "bf16") {
|
||||
// scalar path promotes to float
|
||||
if (!coopmat && !coopmat2) {
|
||||
return "float";
|
||||
}
|
||||
return "bfloat16_t";
|
||||
}
|
||||
return "bfloat16_t";
|
||||
if (coopmat2 || fp16) {
|
||||
return "float16_t";
|
||||
}
|
||||
return "float";
|
||||
case 2:
|
||||
if (t == "bf16") {
|
||||
// scalar path promotes to float
|
||||
if (!coopmat && !coopmat2) {
|
||||
return "vec2";
|
||||
}
|
||||
return "bf16vec2";
|
||||
}
|
||||
if (coopmat2 || fp16) {
|
||||
return "f16vec2";
|
||||
}
|
||||
return "vec2";
|
||||
case 4:
|
||||
if (t == "bf16") {
|
||||
// scalar path promotes to float
|
||||
if (!coopmat && !coopmat2) {
|
||||
return "vec4";
|
||||
}
|
||||
return "bf16vec4";
|
||||
}
|
||||
if (coopmat2 || fp16) {
|
||||
return "f16vec4";
|
||||
}
|
||||
return "vec4";
|
||||
case 8:
|
||||
if (t == "bf16") {
|
||||
// scalar path promotes to float
|
||||
if (!coopmat && !coopmat2) {
|
||||
return "mat2x4";
|
||||
}
|
||||
throw std::runtime_error("bf16 vec8 not supported");
|
||||
}
|
||||
if (coopmat2 || fp16) {
|
||||
return "f16mat2x4";
|
||||
}
|
||||
return "mat2x4";
|
||||
default:
|
||||
throw std::runtime_error("invalid vector size");
|
||||
}
|
||||
if (coopmat2 || fp16) {
|
||||
return "float16_t";
|
||||
}
|
||||
return "float";
|
||||
};
|
||||
|
||||
const std::map<std::string, std::string> float_type_dict_f16 = {
|
||||
{"FLOAT_TYPE", FLOAT_TYPE(1, "f16")},
|
||||
{"FLOAT_TYPE_VEC2", FLOAT_TYPE(2, "f16")},
|
||||
{"FLOAT_TYPE_VEC4", FLOAT_TYPE(4, "f16")},
|
||||
{"FLOAT_TYPE_VEC8", FLOAT_TYPE(8, "f16")},
|
||||
};
|
||||
|
||||
// Shaders with f16 B_TYPE
|
||||
string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
|
||||
string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f16", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
|
||||
// bf16
|
||||
{
|
||||
@@ -379,13 +425,19 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
// scalar path promotes to float
|
||||
std::string to_float_type = (coopmat || coopmat2) ? "uintBitsToBFloat16EXT" : "bf16_to_fp32";
|
||||
|
||||
const std::map<std::string, std::string> float_type_dict_bf16 = {
|
||||
{"FLOAT_TYPE", FLOAT_TYPE(1, "bf16")},
|
||||
{"FLOAT_TYPE_VEC2", FLOAT_TYPE(2, "bf16")},
|
||||
{"FLOAT_TYPE_VEC4", FLOAT_TYPE(4, "bf16")},
|
||||
};
|
||||
|
||||
// If bfloat16 is not supported, then only compile the scalar (promote to fp32) shader
|
||||
#if !defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
|
||||
if (!(coopmat || coopmat2))
|
||||
#endif
|
||||
{
|
||||
string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"B_TYPE32", "vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_bf16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_bf16), {{"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_bf16", source_name, merge_maps(merge_maps(base_dict, float_type_dict_bf16), {{"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -406,20 +458,27 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
// For aligned matmul loads
|
||||
std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? load_vec : load_vec_quant;
|
||||
|
||||
const std::map<std::string, std::string> float_type_dict = {
|
||||
{"FLOAT_TYPE", FLOAT_TYPE(1, tname)},
|
||||
{"FLOAT_TYPE_VEC2", FLOAT_TYPE(2, tname)},
|
||||
{"FLOAT_TYPE_VEC4", FLOAT_TYPE(4, tname)},
|
||||
{"FLOAT_TYPE_VEC8", FLOAT_TYPE(8, tname)},
|
||||
};
|
||||
|
||||
// don't generate f32 variants for coopmat2
|
||||
if (!coopmat2) {
|
||||
string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
}
|
||||
|
||||
if (tname != "f16" && tname != "f32") {
|
||||
string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
}
|
||||
|
||||
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
if (!coopmat && !coopmat2 && matmul_id_type == MatMulIdType::NONE && is_legacy_quant(tname)) {
|
||||
string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -116,6 +116,10 @@ struct webgpu_context_struct {
|
||||
wgpu::Queue queue;
|
||||
wgpu::Limits limits;
|
||||
|
||||
// Separate this out from limits since on some Metal systems, the limit returned by
|
||||
// querying the limits is higher than the actual allowed maximum.
|
||||
uint32_t max_wg_size_x;
|
||||
|
||||
std::recursive_mutex mutex;
|
||||
|
||||
webgpu_buf_pool param_buf_pool;
|
||||
@@ -124,7 +128,15 @@ struct webgpu_context_struct {
|
||||
wgpu::ComputePipeline memset_pipeline;
|
||||
wgpu::ComputePipeline mul_mat_pipeline[30][2];
|
||||
wgpu::ComputePipeline set_rows_pipeline;
|
||||
wgpu::ComputePipeline get_rows_pipeline[30];
|
||||
wgpu::ComputePipeline get_rows_f32_no_vec_pipeline;
|
||||
wgpu::ComputePipeline cpy_pipeline;
|
||||
wgpu::ComputePipeline add_pipeline[2];
|
||||
wgpu::ComputePipeline add_ip_pipeline[2];
|
||||
wgpu::ComputePipeline mul_pipeline[2];
|
||||
wgpu::ComputePipeline mul_ip_pipeline[2];
|
||||
wgpu::ComputePipeline rms_norm_pipeline;
|
||||
wgpu::ComputePipeline rms_norm_ip_pipeline;
|
||||
|
||||
size_t memset_bytes_per_thread;
|
||||
|
||||
@@ -232,14 +244,15 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) {
|
||||
std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
|
||||
if (ctx->callback_futures.empty()) {
|
||||
// no existing callbacks, wait on queue submission
|
||||
ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(
|
||||
wgpu::CallbackMode::AllowSpontaneous,
|
||||
[](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
|
||||
if (status != wgpu::QueueWorkDoneStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str());
|
||||
}
|
||||
}),
|
||||
UINT64_MAX);
|
||||
ctx->instance.WaitAny(
|
||||
ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
|
||||
[](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
|
||||
if (status != wgpu::QueueWorkDoneStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n",
|
||||
std::string(message).c_str());
|
||||
}
|
||||
}),
|
||||
UINT64_MAX);
|
||||
} else {
|
||||
// existing callbacks, wait on them
|
||||
ctx->instance.WaitAny(ctx->callback_futures.size(), ctx->callback_futures.data(), UINT64_MAX);
|
||||
@@ -286,10 +299,7 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) {
|
||||
// Check for errrors in SET_ROWS operations
|
||||
for (auto & error_bufs : staged_set_row_error_bufs) {
|
||||
wgpu::Future f = error_bufs.host_buf.MapAsync(
|
||||
wgpu::MapMode::Read,
|
||||
0,
|
||||
error_bufs.host_buf.GetSize(),
|
||||
wgpu::CallbackMode::AllowSpontaneous,
|
||||
wgpu::MapMode::Read, 0, error_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
|
||||
[ctx, error_bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) {
|
||||
if (status != wgpu::MapAsyncStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", std::string(message).c_str());
|
||||
@@ -311,10 +321,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
|
||||
wgpu::MapMode mode,
|
||||
size_t offset,
|
||||
size_t size) {
|
||||
ctx->instance.WaitAny(buffer.MapAsync(mode,
|
||||
offset,
|
||||
size,
|
||||
wgpu::CallbackMode::AllowSpontaneous,
|
||||
ctx->instance.WaitAny(buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[](wgpu::MapAsyncStatus status, wgpu::StringView message) {
|
||||
if (status != wgpu::MapAsyncStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n",
|
||||
@@ -351,7 +358,8 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
|
||||
std::vector<uint32_t> params,
|
||||
std::vector<wgpu::BindGroupEntry> bind_group_entries,
|
||||
uint32_t wg_x,
|
||||
bool submit_and_wait = false) {
|
||||
const char * bind_group_label = nullptr,
|
||||
bool submit_and_wait = false) {
|
||||
webgpu_pool_bufs params_bufs = ctx->param_buf_pool.alloc_bufs();
|
||||
|
||||
ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize());
|
||||
@@ -372,6 +380,9 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
|
||||
bind_group_desc.layout = pipeline.GetBindGroupLayout(0);
|
||||
bind_group_desc.entryCount = bind_group_entries.size();
|
||||
bind_group_desc.entries = bind_group_entries.data();
|
||||
if (bind_group_label) {
|
||||
bind_group_desc.label = bind_group_label;
|
||||
}
|
||||
wgpu::BindGroup bind_group = ctx->device.CreateBindGroup(&bind_group_desc);
|
||||
|
||||
wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
|
||||
@@ -415,9 +426,9 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx,
|
||||
std::vector<wgpu::BindGroupEntry> entries = {
|
||||
{ .binding = 0, .buffer = buf, .offset = 0, .size = buf.GetSize() }
|
||||
};
|
||||
size_t bytes_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_bytes_per_thread;
|
||||
size_t bytes_per_wg = ctx->max_wg_size_x * ctx->memset_bytes_per_thread;
|
||||
uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg;
|
||||
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, true);
|
||||
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, "MEMSET", true);
|
||||
}
|
||||
|
||||
/** End WebGPU Actions */
|
||||
@@ -461,26 +472,26 @@ static size_t ggml_webgpu_tensor_binding_size(webgpu_context & ctx, ggml_tensor
|
||||
~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1);
|
||||
}
|
||||
|
||||
// Used to determine if two tensors are the same for in-place operations
|
||||
static bool ggml_webgpu_tensor_equal(ggml_tensor * a, ggml_tensor * b) {
|
||||
return (ggml_webgpu_tensor_buf(a).Get() == ggml_webgpu_tensor_buf(b).Get()) &&
|
||||
(ggml_webgpu_tensor_offset(a) == ggml_webgpu_tensor_offset(b));
|
||||
}
|
||||
|
||||
static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
|
||||
uint32_t ne = (uint32_t) ggml_nelements(dst);
|
||||
|
||||
std::vector<uint32_t> params = { ne,
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
||||
// Convert byte-strides to element-strides
|
||||
(uint32_t) (src->nb[0] / ggml_type_size(src->type)),
|
||||
(uint32_t) (src->nb[1] / ggml_type_size(src->type)),
|
||||
(uint32_t) (src->nb[2] / ggml_type_size(src->type)),
|
||||
(uint32_t) (src->nb[3] / ggml_type_size(src->type)),
|
||||
(uint32_t) (dst->nb[0] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
|
||||
// Logical shape — same for both tensors even if permuted
|
||||
(uint32_t) src->ne[0],
|
||||
(uint32_t) src->ne[1],
|
||||
(uint32_t) src->ne[2],
|
||||
(uint32_t) src->ne[3] };
|
||||
std::vector<uint32_t> params = {
|
||||
ne, (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
||||
// Convert byte-strides to element-strides
|
||||
(uint32_t) (src->nb[0] / ggml_type_size(src->type)), (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
|
||||
(uint32_t) (src->nb[2] / ggml_type_size(src->type)), (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
|
||||
(uint32_t) (dst->nb[0] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
|
||||
// Logical shape — same for both tensors even if permuted
|
||||
(uint32_t) src->ne[0], (uint32_t) src->ne[1], (uint32_t) src->ne[2], (uint32_t) src->ne[3]
|
||||
};
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> entries = {
|
||||
{ .binding = 0,
|
||||
@@ -493,9 +504,9 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor
|
||||
.size = ggml_webgpu_tensor_binding_size(ctx, dst) }
|
||||
};
|
||||
|
||||
size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
|
||||
size_t max_wg_size = ctx->max_wg_size_x;
|
||||
uint32_t wg_x = (ne + max_wg_size - 1) / max_wg_size;
|
||||
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline, params, entries, wg_x);
|
||||
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline, params, entries, wg_x, ggml_op_name(dst->op));
|
||||
}
|
||||
|
||||
static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) {
|
||||
@@ -509,27 +520,21 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
|
||||
error_bufs.host_buf.Unmap();
|
||||
}
|
||||
|
||||
std::vector<uint32_t> params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
||||
// Convert byte-strides to element-strides
|
||||
(uint32_t) (src->nb[1] / ggml_type_size(src->type)),
|
||||
(uint32_t) (src->nb[2] / ggml_type_size(src->type)),
|
||||
(uint32_t) (src->nb[3] / ggml_type_size(src->type)),
|
||||
(uint32_t) (idx->nb[0] / ggml_type_size(idx->type)),
|
||||
(uint32_t) (idx->nb[1] / ggml_type_size(idx->type)),
|
||||
(uint32_t) (idx->nb[2] / ggml_type_size(idx->type)),
|
||||
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
|
||||
// Shape of src
|
||||
(uint32_t) src->ne[0],
|
||||
(uint32_t) src->ne[1],
|
||||
(uint32_t) src->ne[2],
|
||||
(uint32_t) src->ne[3],
|
||||
// Shape of idx
|
||||
(uint32_t) (idx->ne[1]),
|
||||
(uint32_t) (idx->ne[2]) };
|
||||
std::vector<uint32_t> params = {
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
||||
// Convert byte-strides to element-strides
|
||||
(uint32_t) (src->nb[1] / ggml_type_size(src->type)), (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
|
||||
(uint32_t) (src->nb[3] / ggml_type_size(src->type)), (uint32_t) (idx->nb[0] / ggml_type_size(idx->type)),
|
||||
(uint32_t) (idx->nb[1] / ggml_type_size(idx->type)), (uint32_t) (idx->nb[2] / ggml_type_size(idx->type)),
|
||||
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
|
||||
// Shape of src
|
||||
(uint32_t) src->ne[0], (uint32_t) src->ne[1], (uint32_t) src->ne[2], (uint32_t) src->ne[3],
|
||||
// Shape of idx
|
||||
(uint32_t) (idx->ne[1]), (uint32_t) (idx->ne[2])
|
||||
};
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> entries = {
|
||||
{ .binding = 0,
|
||||
@@ -547,13 +552,55 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
|
||||
{ .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() }
|
||||
};
|
||||
|
||||
size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
|
||||
size_t max_wg_size = ctx->max_wg_size_x;
|
||||
uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size;
|
||||
|
||||
std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
|
||||
ctx->staged_set_row_error_bufs.push_back(error_bufs);
|
||||
|
||||
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x);
|
||||
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x, ggml_op_name(dst->op));
|
||||
}
|
||||
|
||||
static void ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) {
|
||||
std::vector<uint32_t> params = {
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
||||
// Convert byte-strides to element-strides
|
||||
(uint32_t) (src->nb[1] / ggml_type_size(src->type)), (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
|
||||
(uint32_t) (src->nb[3] / ggml_type_size(src->type)), (uint32_t) (idx->nb[0] / ggml_type_size(idx->type)),
|
||||
(uint32_t) (idx->nb[1] / ggml_type_size(idx->type)), (uint32_t) (idx->nb[2] / ggml_type_size(idx->type)),
|
||||
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
|
||||
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
|
||||
// Shape of dst
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3],
|
||||
// Shape of idx
|
||||
(uint32_t) (idx->ne[1]), (uint32_t) (idx->ne[2])
|
||||
};
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> entries = {
|
||||
{ .binding = 0,
|
||||
.buffer = ggml_webgpu_tensor_buf(src),
|
||||
.offset = ggml_webgpu_tensor_align_offset(ctx, src),
|
||||
.size = ggml_webgpu_tensor_binding_size(ctx, src) },
|
||||
{ .binding = 1,
|
||||
.buffer = ggml_webgpu_tensor_buf(idx),
|
||||
.offset = ggml_webgpu_tensor_align_offset(ctx, idx),
|
||||
.size = ggml_webgpu_tensor_binding_size(ctx, idx) },
|
||||
{ .binding = 2,
|
||||
.buffer = ggml_webgpu_tensor_buf(dst),
|
||||
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
|
||||
.size = ggml_webgpu_tensor_binding_size(ctx, dst) }
|
||||
};
|
||||
|
||||
size_t max_wg_size = ctx->max_wg_size_x;
|
||||
uint32_t wg_x = (dst->ne[1] * dst->ne[2] * dst->ne[3] + max_wg_size - 1) / max_wg_size;
|
||||
|
||||
wgpu::ComputePipeline pipeline = ctx->get_rows_pipeline[src->type];
|
||||
if (src->type == GGML_TYPE_F32 && dst->ne[0] % 4 != 0) {
|
||||
pipeline = ctx->get_rows_f32_no_vec_pipeline;
|
||||
}
|
||||
ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op));
|
||||
}
|
||||
|
||||
static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
|
||||
@@ -593,7 +640,104 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t
|
||||
|
||||
uint32_t wg_x =
|
||||
(dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE;
|
||||
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x);
|
||||
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x,
|
||||
ggml_op_name(dst->op));
|
||||
}
|
||||
|
||||
static void ggml_webgpu_binary_op(webgpu_context & ctx,
|
||||
ggml_tensor * src0,
|
||||
ggml_tensor * src1,
|
||||
ggml_tensor * dst,
|
||||
wgpu::ComputePipeline & pipeline,
|
||||
bool in_place) {
|
||||
std::vector<uint32_t> params = {
|
||||
(uint32_t) ggml_nelements(dst),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
||||
(uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
|
||||
(uint32_t) src0->ne[0],
|
||||
(uint32_t) src0->ne[1],
|
||||
(uint32_t) src0->ne[2],
|
||||
(uint32_t) src1->ne[0],
|
||||
(uint32_t) src1->ne[1],
|
||||
(uint32_t) src1->ne[2],
|
||||
(uint32_t) src1->ne[3],
|
||||
};
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> entries = {
|
||||
{ .binding = 0,
|
||||
.buffer = ggml_webgpu_tensor_buf(src0),
|
||||
.offset = ggml_webgpu_tensor_align_offset(ctx, src0),
|
||||
.size = ggml_webgpu_tensor_binding_size(ctx, src0) },
|
||||
{ .binding = 1,
|
||||
.buffer = ggml_webgpu_tensor_buf(src1),
|
||||
.offset = ggml_webgpu_tensor_align_offset(ctx, src1),
|
||||
.size = ggml_webgpu_tensor_binding_size(ctx, src1) }
|
||||
};
|
||||
if (!in_place) {
|
||||
entries.push_back({ .binding = 2,
|
||||
.buffer = ggml_webgpu_tensor_buf(dst),
|
||||
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
|
||||
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
|
||||
}
|
||||
|
||||
size_t max_wg_size = ctx->max_wg_size_x;
|
||||
uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size;
|
||||
ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op));
|
||||
}
|
||||
|
||||
static void ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
|
||||
bool in_place = ggml_webgpu_tensor_equal(src, dst);
|
||||
|
||||
uint32_t eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
std::vector<uint32_t> params = {
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
|
||||
};
|
||||
if (!in_place) {
|
||||
params.push_back((uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)));
|
||||
}
|
||||
params.push_back((uint32_t) (src->nb[1] / ggml_type_size(src->type)));
|
||||
params.push_back((uint32_t) (src->nb[2] / ggml_type_size(src->type)));
|
||||
params.push_back((uint32_t) (src->nb[3] / ggml_type_size(src->type)));
|
||||
if (!in_place) {
|
||||
params.push_back((uint32_t) (dst->nb[1] / ggml_type_size(dst->type)));
|
||||
params.push_back((uint32_t) (dst->nb[2] / ggml_type_size(dst->type)));
|
||||
params.push_back((uint32_t) (dst->nb[3] / ggml_type_size(dst->type)));
|
||||
}
|
||||
params.push_back((uint32_t) src->ne[0]);
|
||||
params.push_back((uint32_t) src->ne[1]);
|
||||
params.push_back((uint32_t) src->ne[2]);
|
||||
params.push_back((uint32_t) src->ne[3]);
|
||||
params.push_back(eps); // epsilon, will be bitcast to float in shader
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> entries = {
|
||||
{ .binding = 0,
|
||||
.buffer = ggml_webgpu_tensor_buf(src),
|
||||
.offset = ggml_webgpu_tensor_align_offset(ctx, src),
|
||||
.size = ggml_webgpu_tensor_binding_size(ctx, src) }
|
||||
};
|
||||
if (!in_place) {
|
||||
entries.push_back({ .binding = 1,
|
||||
.buffer = ggml_webgpu_tensor_buf(dst),
|
||||
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
|
||||
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
|
||||
}
|
||||
|
||||
wgpu::ComputePipeline pipeline;
|
||||
if (in_place) {
|
||||
pipeline = ctx->rms_norm_ip_pipeline;
|
||||
} else {
|
||||
pipeline = ctx->rms_norm_pipeline;
|
||||
}
|
||||
size_t max_wg_size = ctx->max_wg_size_x;
|
||||
uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size;
|
||||
ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op));
|
||||
}
|
||||
|
||||
// Returns true if node has enqueued work into the queue, false otherwise
|
||||
@@ -615,20 +759,34 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
|
||||
case GGML_OP_RESHAPE:
|
||||
return false;
|
||||
case GGML_OP_CPY:
|
||||
{
|
||||
ggml_webgpu_cpy(ctx, src0, node);
|
||||
break;
|
||||
}
|
||||
ggml_webgpu_cpy(ctx, src0, node);
|
||||
break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
ggml_webgpu_set_rows(ctx, src0, src1, node);
|
||||
break;
|
||||
}
|
||||
ggml_webgpu_set_rows(ctx, src0, src1, node);
|
||||
break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
ggml_webgpu_get_rows(ctx, src0, src1, node);
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
ggml_webgpu_mul_mat(ctx, src0, src1, node);
|
||||
break;
|
||||
ggml_webgpu_mul_mat(ctx, src0, src1, node);
|
||||
break;
|
||||
case GGML_OP_ADD:
|
||||
if (ggml_webgpu_tensor_equal(src0, node)) {
|
||||
ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_ip_pipeline[node->type], true);
|
||||
} else {
|
||||
ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type], false);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL:
|
||||
if (ggml_webgpu_tensor_equal(src0, node)) {
|
||||
ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_ip_pipeline[node->type], true);
|
||||
} else {
|
||||
ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type], false);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
ggml_webgpu_rms_norm(ctx, src0, node);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -731,8 +889,8 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
((uint8_t *) &val32)[i] = ((const uint8_t *) data)[size - remaining_size + i];
|
||||
}
|
||||
// memset the remaining bytes
|
||||
ggml_backend_webgpu_buffer_memset(
|
||||
webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size), remaining_size);
|
||||
ggml_backend_webgpu_buffer_memset(webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size),
|
||||
remaining_size);
|
||||
} else {
|
||||
// wait for WriteBuffer to complete
|
||||
ggml_backend_webgpu_wait_on_submission(webgpu_ctx);
|
||||
@@ -766,11 +924,8 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
if (webgpu_ctx->get_tensor_staging_buf) {
|
||||
webgpu_ctx->get_tensor_staging_buf.Destroy();
|
||||
}
|
||||
ggml_webgpu_create_buffer(device,
|
||||
webgpu_ctx->get_tensor_staging_buf,
|
||||
final_size,
|
||||
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
|
||||
"get_tensor_staging_buf");
|
||||
ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size,
|
||||
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "get_tensor_staging_buf");
|
||||
}
|
||||
|
||||
// Copy the data from the buffer to the staging buffer
|
||||
@@ -824,8 +979,7 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b
|
||||
ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);
|
||||
|
||||
wgpu::Buffer buf;
|
||||
ggml_webgpu_create_buffer(ctx->webgpu_ctx->device,
|
||||
buf,
|
||||
ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf,
|
||||
(size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1),
|
||||
wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst,
|
||||
"allocated_buffer");
|
||||
@@ -890,9 +1044,17 @@ static ggml_guid_t ggml_backend_webgpu_guid(void) {
|
||||
return reinterpret_cast<ggml_guid_t>((void *) guid_str);
|
||||
}
|
||||
|
||||
// The max workgroup size is a common constant
|
||||
static std::vector<wgpu::ConstantEntry> ggml_webgpu_max_wg_size_entry(webgpu_context & webgpu_ctx) {
|
||||
std::vector<wgpu::ConstantEntry> constants(1);
|
||||
constants[0].key = "wg_size";
|
||||
constants[0].value = webgpu_ctx->max_wg_size_x;
|
||||
return constants;
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) {
|
||||
// we use the maximum workgroup size for the memset pipeline
|
||||
size_t max_wg_size = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
|
||||
size_t max_wg_size = webgpu_ctx->max_wg_size_x;
|
||||
size_t max_threads = max_wg_size * webgpu_ctx->limits.maxComputeWorkgroupsPerDimension;
|
||||
// Size the bytes_per_thread so that the largest buffer size can be handled
|
||||
webgpu_ctx->memset_bytes_per_thread =
|
||||
@@ -906,109 +1068,142 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) {
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F32][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_f32_f32,
|
||||
"mul_mat_f32_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F16],
|
||||
wgsl_mul_mat_f16_f16,
|
||||
"mul_mat_f16_f16");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_f16_f32,
|
||||
"mul_mat_f16_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q4_0_f32,
|
||||
"mul_mat_q4_0_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q4_1_f32,
|
||||
"mul_mat_q4_1_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_0][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q5_0_f32,
|
||||
"mul_mat_q5_0_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_1][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q5_1_f32,
|
||||
"mul_mat_q5_1_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q8_0][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q8_0_f32,
|
||||
"mul_mat_q8_0_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q2_K][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q2_k_f32,
|
||||
"mul_mat_q2_k_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q3_K][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q3_k_f32,
|
||||
"mul_mat_q3_k_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_K][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q4_k_f32,
|
||||
"mul_mat_q4_k_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_K][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q5_k_f32,
|
||||
"mul_mat_q5_k_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q6_K][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q6_k_f32,
|
||||
"mul_mat_q6_k_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XXS][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq2_xxs_f32,
|
||||
"mul_mat_iq2_xxs_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XS][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq2_xs_f32,
|
||||
"mul_mat_iq2_xs_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_S][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq2_s_f32,
|
||||
"mul_mat_iq2_s_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_XXS][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq3_xxs_f32,
|
||||
"mul_mat_iq3_xxs_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_S][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq3_s_f32,
|
||||
"mul_mat_iq3_s_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_S][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq1_s_f32,
|
||||
"mul_mat_iq1_s_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_M][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq1_m_f32,
|
||||
"mul_mat_iq1_m_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_NL][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq4_nl_f32,
|
||||
"mul_mat_iq4_nl_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
||||
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq4_xs_f32,
|
||||
"mul_mat_iq4_xs_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F32][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_f32_f32, "mul_mat_f32_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F16],
|
||||
wgsl_mul_mat_f16_f16, "mul_mat_f16_f16");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_f16_f32, "mul_mat_f16_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q4_0_f32, "mul_mat_q4_0_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q4_1_f32, "mul_mat_q4_1_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_0][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q5_0_f32, "mul_mat_q5_0_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_1][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q5_1_f32, "mul_mat_q5_1_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q8_0][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q8_0_f32, "mul_mat_q8_0_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q2_K][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q2_k_f32, "mul_mat_q2_k_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q3_K][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q3_k_f32, "mul_mat_q3_k_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_K][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q4_k_f32, "mul_mat_q4_k_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_K][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q5_k_f32, "mul_mat_q5_k_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q6_K][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_q6_k_f32, "mul_mat_q6_k_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XXS][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq2_xxs_f32, "mul_mat_iq2_xxs_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XS][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq2_xs_f32, "mul_mat_iq2_xs_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_S][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq2_s_f32, "mul_mat_iq2_s_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_XXS][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq3_xxs_f32, "mul_mat_iq3_xxs_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_S][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq3_s_f32, "mul_mat_iq3_s_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_S][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq1_s_f32, "mul_mat_iq1_s_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_M][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq1_m_f32, "mul_mat_iq1_m_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_NL][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq4_nl_f32, "mul_mat_iq4_nl_f32");
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
|
||||
wgsl_mul_mat_iq4_xs_f32, "mul_mat_iq4_xs_f32");
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
|
||||
std::vector<wgpu::ConstantEntry> constants(1);
|
||||
constants[0].key = "wg_size";
|
||||
constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
|
||||
ggml_webgpu_create_pipeline(
|
||||
webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows",
|
||||
ggml_webgpu_max_wg_size_entry(webgpu_ctx));
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_get_rows_pipeline(webgpu_context & webgpu_ctx) {
|
||||
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_F32], wgsl_get_rows_f32_vec,
|
||||
"get_rows_f32_vec", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_f32_no_vec_pipeline, wgsl_get_rows_f32,
|
||||
"get_rows_f32", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_F16], wgsl_get_rows_f16,
|
||||
"get_rows_f16", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_I32], wgsl_get_rows_i32,
|
||||
"get_rows_i32", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q4_0], wgsl_get_rows_q4_0,
|
||||
"get_rows_q4_0", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q4_1], wgsl_get_rows_q4_1,
|
||||
"get_rows_q4_1", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q5_0], wgsl_get_rows_q5_0,
|
||||
"get_rows_q5_0", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q5_1], wgsl_get_rows_q5_1,
|
||||
"get_rows_q5_1", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q8_0], wgsl_get_rows_q8_0,
|
||||
"get_rows_q8_0", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q2_K], wgsl_get_rows_q2_k,
|
||||
"get_rows_q2_k", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q3_K], wgsl_get_rows_q3_k,
|
||||
"get_rows_q3_k", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q4_K], wgsl_get_rows_q4_k,
|
||||
"get_rows_q4_k", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q5_K], wgsl_get_rows_q5_k,
|
||||
"get_rows_q5_k", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q6_K], wgsl_get_rows_q6_k,
|
||||
"get_rows_q6_k", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ2_XXS],
|
||||
wgsl_get_rows_iq2_xxs, "get_rows_iq2_xxs", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ2_XS],
|
||||
wgsl_get_rows_iq2_xs, "get_rows_iq2_xs", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ2_S], wgsl_get_rows_iq2_s,
|
||||
"get_rows_iq2_s", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ3_XXS],
|
||||
wgsl_get_rows_iq3_xxs, "get_rows_iq3_xxs", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ3_S], wgsl_get_rows_iq3_s,
|
||||
"get_rows_iq3_s", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ1_S], wgsl_get_rows_iq1_s,
|
||||
"get_rows_iq1_s", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ1_M], wgsl_get_rows_iq1_m,
|
||||
"get_rows_iq1_m", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ4_NL],
|
||||
wgsl_get_rows_iq4_nl, "get_rows_iq4_nl", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ4_XS],
|
||||
wgsl_get_rows_iq4_xs, "get_rows_iq4_xs", constants);
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) {
|
||||
std::vector<wgpu::ConstantEntry> constants(1);
|
||||
constants[0].key = "wg_size";
|
||||
constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline, wgsl_cpy, "cpy", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline, wgsl_cpy, "cpy",
|
||||
ggml_webgpu_max_wg_size_entry(webgpu_ctx));
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_add_pipeline(webgpu_context & webgpu_ctx) {
|
||||
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32], wgsl_add_f32, "add_f32",
|
||||
constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16], wgsl_add_f16, "add_f16",
|
||||
constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_ip_pipeline[GGML_TYPE_F32], wgsl_add_in_place_f32,
|
||||
"add_in_place_f32", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_ip_pipeline[GGML_TYPE_F16], wgsl_add_in_place_f16,
|
||||
"add_in_place_f16", constants);
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_mul_pipeline(webgpu_context & webgpu_ctx) {
|
||||
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32], wgsl_mul_f32, "mul_f32",
|
||||
constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16], wgsl_mul_f16, "mul_f16",
|
||||
constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_ip_pipeline[GGML_TYPE_F32], wgsl_mul_in_place_f32,
|
||||
"mul_in_place_f32", constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_ip_pipeline[GGML_TYPE_F16], wgsl_mul_in_place_f16,
|
||||
"mul_in_place_f16", constants);
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) {
|
||||
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rms_norm_pipeline, wgsl_rms_norm, "rms_norm",
|
||||
constants);
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rms_norm_ip_pipeline, wgsl_rms_norm_in_place,
|
||||
"rms_norm_in_place", constants);
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
@@ -1058,24 +1253,77 @@ static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggm
|
||||
return buft->iface.get_name == ggml_backend_webgpu_buffer_type_get_name;
|
||||
}
|
||||
|
||||
static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
GGML_UNUSED(dev);
|
||||
static bool ggml_webgpu_supported_qtype(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ1_M:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
|
||||
|
||||
webgpu_context webgpu_ctx = ctx->webgpu_ctx;
|
||||
|
||||
ggml_tensor * src0 = op->src[0];
|
||||
ggml_tensor * src1 = op->src[1];
|
||||
// on smaller devices (or CI), tensors may be larger than the max storage buffer size
|
||||
if (ggml_nbytes(op) > webgpu_ctx->limits.maxStorageBufferBindingSize ||
|
||||
(src0 != nullptr && ggml_nbytes(src0) > webgpu_ctx->limits.maxStorageBufferBindingSize) ||
|
||||
(src1 != nullptr && ggml_nbytes(src1) > webgpu_ctx->limits.maxStorageBufferBindingSize)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool supports_op = false;
|
||||
switch (op->op) {
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_RESHAPE:
|
||||
return true;
|
||||
supports_op = true;
|
||||
break;
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_MUL:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (op->src[0]->type == op->type) &&
|
||||
(op->src[1]->type == op->type);
|
||||
break;
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_SET_ROWS:
|
||||
return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32;
|
||||
supports_op = (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32);
|
||||
break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
if (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16 ||
|
||||
op->src[0]->type == GGML_TYPE_I32 || ggml_webgpu_supported_qtype(op->src[0]->type)) {
|
||||
supports_op = (op->type == GGML_TYPE_F32);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
switch (op->src[1]->type) {
|
||||
case GGML_TYPE_F16:
|
||||
return op->src[0]->type == GGML_TYPE_F16;
|
||||
supports_op = (op->src[0]->type == GGML_TYPE_F16);
|
||||
break;
|
||||
case GGML_TYPE_F32:
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
@@ -1099,17 +1347,30 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
case GGML_TYPE_IQ1_M:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
return true;
|
||||
supports_op = true;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_RMS_NORM:
|
||||
supports_op = op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_WEBGPU_DEBUG
|
||||
if (!supports_op) {
|
||||
WEBGPU_LOG_DEBUG("not supported: " << ggml_op_name(op->op) << " with types dst: " << ggml_type_name(op->type)
|
||||
<< ", src0: " << (op->src[0] ? ggml_type_name(op->src[0]->type) : "null")
|
||||
<< ", src1: " << (op->src[1] ? ggml_type_name(op->src[1]->type) : "null"));
|
||||
}
|
||||
#endif
|
||||
return supports_op;
|
||||
}
|
||||
|
||||
static struct ggml_backend_device_i ggml_backend_webgpu_device_i = {
|
||||
@@ -1155,18 +1416,20 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
||||
webgpu_context ctx = reg_ctx->webgpu_ctx;
|
||||
|
||||
wgpu::RequestAdapterOptions options = {};
|
||||
ctx->instance.WaitAny(
|
||||
ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
|
||||
if (status != wgpu::RequestAdapterStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
||||
return;
|
||||
}
|
||||
ctx->adapter = std::move(adapter);
|
||||
}), UINT64_MAX);
|
||||
ctx->instance.WaitAny(ctx->instance.RequestAdapter(
|
||||
&options, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
|
||||
if (status != wgpu::RequestAdapterStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
||||
return;
|
||||
}
|
||||
ctx->adapter = std::move(adapter);
|
||||
}),
|
||||
UINT64_MAX);
|
||||
GGML_ASSERT(ctx->adapter != nullptr);
|
||||
|
||||
ctx->adapter.GetLimits(&ctx->limits);
|
||||
ctx->max_wg_size_x = 288; // default value
|
||||
|
||||
wgpu::AdapterInfo info{};
|
||||
ctx->adapter.GetInfo(&info);
|
||||
@@ -1182,21 +1445,21 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
||||
wgpu::CallbackMode::AllowSpontaneous,
|
||||
[](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
|
||||
GGML_UNUSED(device);
|
||||
GGML_LOG_ERROR(
|
||||
"ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason), std::string(message).c_str());
|
||||
GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason),
|
||||
std::string(message).c_str());
|
||||
});
|
||||
dev_desc.SetUncapturedErrorCallback(
|
||||
[](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) {
|
||||
GGML_UNUSED(device);
|
||||
GGML_LOG_ERROR(
|
||||
"ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason), std::string(message).c_str());
|
||||
GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason),
|
||||
std::string(message).c_str());
|
||||
});
|
||||
ctx->instance.WaitAny(ctx->adapter.RequestDevice(
|
||||
&dev_desc,
|
||||
wgpu::CallbackMode::AllowSpontaneous,
|
||||
&dev_desc, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
|
||||
if (status != wgpu::RequestDeviceStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", std::string(message).c_str());
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n",
|
||||
std::string(message).c_str());
|
||||
return;
|
||||
}
|
||||
ctx->device = std::move(device);
|
||||
@@ -1208,34 +1471,28 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
||||
ctx->queue = ctx->device.GetQueue();
|
||||
|
||||
// Create buffer pool for shader parameters
|
||||
ctx->param_buf_pool.init(ctx->device,
|
||||
WEBGPU_NUM_PARAM_BUFS,
|
||||
WEBGPU_PARAMS_BUF_SIZE_BYTES,
|
||||
ctx->param_buf_pool.init(ctx->device, WEBGPU_NUM_PARAM_BUFS, WEBGPU_PARAMS_BUF_SIZE_BYTES,
|
||||
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform,
|
||||
wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite);
|
||||
ctx->set_rows_error_buf_pool.init(ctx->device,
|
||||
WEBGPU_NUM_SET_ROWS_ERROR_BUFS,
|
||||
WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
|
||||
ctx->set_rows_error_buf_pool.init(ctx->device, WEBGPU_NUM_SET_ROWS_ERROR_BUFS, WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
|
||||
wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage,
|
||||
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead);
|
||||
|
||||
ggml_webgpu_init_memset_pipeline(ctx);
|
||||
ggml_webgpu_init_mul_mat_pipeline(ctx);
|
||||
ggml_webgpu_init_set_rows_pipeline(ctx);
|
||||
ggml_webgpu_init_get_rows_pipeline(ctx);
|
||||
ggml_webgpu_init_cpy_pipeline(ctx);
|
||||
ggml_webgpu_init_add_pipeline(ctx);
|
||||
ggml_webgpu_init_mul_pipeline(ctx);
|
||||
ggml_webgpu_init_rms_norm_pipeline(ctx);
|
||||
|
||||
#ifdef GGML_WEBGPU_DEBUG
|
||||
// Initialize debug buffers
|
||||
ggml_webgpu_create_buffer(ctx->device,
|
||||
ctx->debug_host_buf,
|
||||
WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
|
||||
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
|
||||
"debug_host_buf");
|
||||
ggml_webgpu_create_buffer(ctx->device,
|
||||
ctx->debug_dev_buf,
|
||||
WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
|
||||
wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc,
|
||||
"debug_dev_buf");
|
||||
ggml_webgpu_create_buffer(ctx->device, ctx->debug_host_buf, WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
|
||||
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "debug_host_buf");
|
||||
ggml_webgpu_create_buffer(ctx->device, ctx->debug_dev_buf, WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
|
||||
wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, "debug_dev_buf");
|
||||
#endif
|
||||
|
||||
static ggml_backend_webgpu_device_context device_ctx;
|
||||
@@ -1246,12 +1503,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
||||
GGML_LOG_INFO(
|
||||
"ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | "
|
||||
"device_desc: %s\n",
|
||||
info.vendorID,
|
||||
std::string(info.vendor).c_str(),
|
||||
std::string(info.architecture).c_str(),
|
||||
info.deviceID,
|
||||
std::string(info.device).c_str(),
|
||||
std::string(info.description).c_str());
|
||||
info.vendorID, std::string(info.vendor).c_str(), std::string(info.architecture).c_str(), info.deviceID,
|
||||
std::string(info.device).c_str(), std::string(info.description).c_str());
|
||||
|
||||
// See GGML Backend Device Interface section
|
||||
static ggml_backend_device device = {
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
#define(VARIANTS)
|
||||
|
||||
[
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "f32",
|
||||
}
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "f16",
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
#end(VARIANTS)
|
||||
|
||||
#define(SHADER)
|
||||
|
||||
enable f16;
|
||||
|
||||
#include "binary_head.tmpl"
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src0: array<{{TYPE}}>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> src1: array<{{TYPE}}>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<storage, read_write> dst: array<{{TYPE}}>;
|
||||
|
||||
@group(0) @binding(3)
|
||||
var<uniform> params: Params;
|
||||
|
||||
override wg_size: u32;
|
||||
@compute @workgroup_size(wg_size)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
if (gid.x < params.ne) {
|
||||
dst[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] + src1[params.offset_src1 + src1_index(gid.x)];
|
||||
}
|
||||
}
|
||||
|
||||
#end(SHADER)
|
||||
@@ -0,0 +1,41 @@
|
||||
#define(VARIANTS)
|
||||
|
||||
[
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "f32",
|
||||
}
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "f16",
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
#end(VARIANTS)
|
||||
|
||||
#define(SHADER)
|
||||
|
||||
enable f16;
|
||||
|
||||
#include "binary_head.tmpl"
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src0: array<{{TYPE}}>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> src1: array<{{TYPE}}>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
|
||||
override wg_size: u32;
|
||||
@compute @workgroup_size(wg_size)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
if (gid.x < params.ne) {
|
||||
src0[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] + src1[params.offset_src1 + src1_index(gid.x)];
|
||||
}
|
||||
}
|
||||
|
||||
#end(SHADER)
|
||||
@@ -0,0 +1,45 @@
|
||||
struct Params {
|
||||
ne: u32,
|
||||
|
||||
// offsets in elements
|
||||
offset_src0: u32,
|
||||
offset_src1: u32,
|
||||
offset_dst: u32,
|
||||
|
||||
stride_src1_0: u32,
|
||||
stride_src1_1: u32,
|
||||
stride_src1_2: u32,
|
||||
stride_src1_3: u32,
|
||||
|
||||
a_ne0: u32,
|
||||
a_ne1: u32,
|
||||
a_ne2: u32,
|
||||
|
||||
b_ne0: u32,
|
||||
b_ne1: u32,
|
||||
b_ne2: u32,
|
||||
b_ne3: u32,
|
||||
};
|
||||
|
||||
fn src1_index(_i: u32) -> u32 {
|
||||
var i = _i;
|
||||
let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
|
||||
i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0);
|
||||
let a_i2 = i / (params.a_ne1 * params.a_ne0);
|
||||
i = i % (params.a_ne1 * params.a_ne0);
|
||||
let a_i1 = i / params.a_ne0;
|
||||
let a_i0 = i % params.a_ne0;
|
||||
|
||||
// handle repetition of b
|
||||
// index loops back to the beginning and repeats after elements are exhausted = modulo
|
||||
let b_i0 = a_i0 % params.b_ne0;
|
||||
let b_i1 = a_i1 % params.b_ne1;
|
||||
let b_i2 = a_i2 % params.b_ne2;
|
||||
let b_i3 = a_i3 % params.b_ne3;
|
||||
|
||||
// compute index for position in b's flat array
|
||||
return b_i0 * params.stride_src1_0 +
|
||||
b_i1 * params.stride_src1_1 +
|
||||
b_i2 * params.stride_src1_2 +
|
||||
b_i3 * params.stride_src1_3;
|
||||
}
|
||||
@@ -0,0 +1,930 @@
|
||||
#decl(BYTE_HELPERS)
|
||||
|
||||
fn get_byte(value: u32, index: u32) -> u32 {
|
||||
return (value >> (index * 8)) & 0xFF;
|
||||
}
|
||||
|
||||
fn get_byte_i32(value: u32, index: u32) -> i32 {
|
||||
return bitcast<i32>(((value >> (index * 8)) & 0xFF) << 24) >> 24;
|
||||
}
|
||||
|
||||
#enddecl(BYTE_HELPERS)
|
||||
|
||||
#decl(Q4_0_T)
|
||||
struct q4_0 {
|
||||
d: f16,
|
||||
qs: array<f16, 8>
|
||||
};
|
||||
#enddecl(Q4_0_T)
|
||||
|
||||
#decl(Q4_1_T)
|
||||
struct q4_1 {
|
||||
d: f16,
|
||||
m: f16,
|
||||
qs: array<u32, 4>
|
||||
};
|
||||
#enddecl(Q4_1_T)
|
||||
|
||||
#decl(Q5_0_T)
|
||||
struct q5_0 {
|
||||
d: f16,
|
||||
qh: array<f16, 2>,
|
||||
qs: array<f16, 8>
|
||||
};
|
||||
#enddecl(Q5_0_T)
|
||||
|
||||
#decl(Q5_1_T)
|
||||
struct q5_1 {
|
||||
d: f16,
|
||||
m: f16,
|
||||
qh: u32,
|
||||
qs: array<u32, 4>
|
||||
};
|
||||
#enddecl(Q5_1_T)
|
||||
|
||||
#decl(Q8_0_T)
|
||||
struct q8_0 {
|
||||
d: f16,
|
||||
qs: array<f16, 16>
|
||||
};
|
||||
#enddecl(Q8_0_T)
|
||||
|
||||
#decl(Q8_1_T)
|
||||
struct q8_1 {
|
||||
d: f16,
|
||||
m: f16,
|
||||
qs: array<u32, 8>
|
||||
};
|
||||
#enddecl(Q8_1_T)
|
||||
|
||||
#decl(Q2_K_T)
|
||||
struct q2_k {
|
||||
scales: array<u32, 4>,
|
||||
qs: array<u32, 16>,
|
||||
d: f16,
|
||||
dmin: f16
|
||||
};
|
||||
#enddecl(Q2_K_T)
|
||||
|
||||
#decl(Q3_K_T)
|
||||
struct q3_k {
|
||||
hmask: array<f16, 16>,
|
||||
qs: array<f16, 32>,
|
||||
scales: array<f16, 6>,
|
||||
d: f16
|
||||
};
|
||||
#enddecl(Q3_K_T)
|
||||
|
||||
#decl(Q45_K_SCALE_MIN)
|
||||
|
||||
fn get_scale_min(is: u32, scales: array<u32, 3>) -> vec2<f32> {
|
||||
if (is < 4) {
|
||||
let sc_byte = get_byte(scales[is / 4], is % 4);
|
||||
let min_byte = get_byte(scales[(is + 4) / 4], is % 4);
|
||||
return vec2(f32(sc_byte & 63), f32(min_byte & 63));
|
||||
} else {
|
||||
let sc_min_lo = get_byte(scales[(is + 4) / 4], (is + 4) % 4);
|
||||
let sc_hi = get_byte(scales[(is - 4) / 4], (is - 4) % 4);
|
||||
let min_hi = get_byte(scales[is / 4], is % 4);
|
||||
let sc = (sc_min_lo & 0xF) | ((sc_hi >> 6) << 4);
|
||||
let m = (sc_min_lo >> 4) | ((min_hi >> 6) << 4);
|
||||
return vec2(f32(sc), f32(m));
|
||||
}
|
||||
}
|
||||
|
||||
#enddecl(Q45_K_SCALE_MIN)
|
||||
|
||||
#decl(Q4_K_T)
|
||||
struct q4_k {
|
||||
d: f16,
|
||||
dmin: f16,
|
||||
scales: array<u32, 3>,
|
||||
qs: array<u32, 32>
|
||||
};
|
||||
#enddecl(Q4_K_T)
|
||||
|
||||
#decl(Q5_K_T)
|
||||
struct q5_k {
|
||||
d: f16,
|
||||
dmin: f16,
|
||||
scales: array<u32, 3>,
|
||||
qh: array<u32, 8>,
|
||||
qs: array<u32, 32>
|
||||
};
|
||||
#enddecl(Q5_K_T)
|
||||
|
||||
#decl(Q6_K_T)
|
||||
struct q6_k {
|
||||
ql: array<f16, 64>,
|
||||
qh: array<f16, 32>,
|
||||
scales: array<f16, 8>,
|
||||
d: f16
|
||||
};
|
||||
#enddecl(Q6_K_T)
|
||||
|
||||
#decl(IQ2_XXS_T)
|
||||
struct iq2_xxs {
|
||||
d: f16,
|
||||
qs: array<f16, 32>
|
||||
};
|
||||
#enddecl(IQ2_XXS_T)
|
||||
|
||||
#decl(IQ2_XS_T)
|
||||
struct iq2_xs {
|
||||
d: f16,
|
||||
qs: array<f16, 32>,
|
||||
scales: array<f16, 4>
|
||||
};
|
||||
#enddecl(IQ2_XS_T)
|
||||
|
||||
#decl(IQ2_S_T)
|
||||
struct iq2_s {
|
||||
d: f16,
|
||||
qs: array<f16, 32>,
|
||||
qh: array<f16, 4>,
|
||||
scales: array<f16, 4>
|
||||
};
|
||||
#enddecl(IQ2_S_T)
|
||||
|
||||
#decl(IQ3_XSS_T)
|
||||
struct iq3_xxs {
|
||||
d: f16,
|
||||
qs: array<f16, 48>
|
||||
};
|
||||
#enddecl(IQ3_XSS_T)
|
||||
|
||||
#decl(IQ3_S_T)
|
||||
struct iq3_s {
|
||||
d: f16,
|
||||
qs: array<f16, 32>,
|
||||
qh: array<f16, 4>,
|
||||
signs: array<f16, 16>,
|
||||
scales: array<f16, 2>
|
||||
};
|
||||
#enddecl(IQ3_S_T)
|
||||
|
||||
#decl(IQ1_S_T)
|
||||
struct iq1_s {
|
||||
d: f16,
|
||||
qs: array<f16, 16>,
|
||||
qh: array<f16, 8>
|
||||
};
|
||||
#enddecl(IQ1_S_T)
|
||||
|
||||
#decl(IQ1_M_T)
|
||||
struct iq1_m {
|
||||
qs: array<u32, 8>,
|
||||
qh: array<u32, 4>,
|
||||
scales: array<u32, 2>
|
||||
};
|
||||
#enddecl(IQ1_M_T)
|
||||
|
||||
#decl(IQ4_NL_T)
|
||||
struct iq4_nl {
|
||||
d: f16,
|
||||
qs: array<f16, 8>,
|
||||
};
|
||||
#enddecl(IQ4_NL_T)
|
||||
|
||||
#decl(IQ4_XS_T)
|
||||
struct iq4_xs {
|
||||
d: f16,
|
||||
scales_h: f16,
|
||||
scales_l: u32,
|
||||
qs: array<u32, 32>
|
||||
};
|
||||
#enddecl(IQ4_XS_T)
|
||||
|
||||
#decl(IQ23_TABLES)
|
||||
const kmask_iq2xs : array<u32, 2> = array<u32, 2>(
|
||||
0x08040201u, // 1, 2, 4, 8
|
||||
0x80402010u // 16, 32, 64, 128
|
||||
);
|
||||
|
||||
const ksigns_iq2xs: array<u32, 32> = array<u32, 32>(
|
||||
0x03828100,0x87060584,0x8b0a0988,0x0f8e8d0c,
|
||||
0x93121190,0x17969514,0x1b9a9918,0x9f1e1d9c,
|
||||
0xa32221a0,0x27a6a524,0x2baaa928,0xaf2e2dac,
|
||||
0x33b2b130,0xb73635b4,0xbb3a39b8,0x3fbebd3c,
|
||||
0xc34241c0,0x47c6c544,0x4bcac948,0xcf4e4dcc,
|
||||
0x53d2d150,0xd75655d4,0xdb5a59d8,0x5fdedd5c,
|
||||
0x63e2e160,0xe76665e4,0xeb6a69e8,0x6feeed6c,
|
||||
0xf37271f0,0x77f6f574,0x7bfaf978,0xff7e7dfc
|
||||
);
|
||||
#enddecl(IQ23_TABLES)
|
||||
|
||||
#decl(IQ2_XXS_GRID)
|
||||
const iq2xxs_grid = array<u32, 512>(
|
||||
0x08080808, 0x08080808, 0x0808082b, 0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808,
|
||||
0x08082b2b, 0x08080808, 0x08190819, 0x08080808, 0x08191908, 0x08080808, 0x082b0808, 0x08080808,
|
||||
0x082b082b, 0x08080808, 0x082b2b08, 0x08080808, 0x082b2b2b, 0x08080808, 0x19080819, 0x08080808,
|
||||
0x19081908, 0x08080808, 0x19190808, 0x08080808, 0x19192b08, 0x08080808, 0x192b0819, 0x08080808,
|
||||
0x192b1908, 0x08080808, 0x2b080808, 0x08080808, 0x2b08082b, 0x08080808, 0x2b082b2b, 0x08080808,
|
||||
0x2b2b082b, 0x08080808, 0x08080819, 0x08080819, 0x08081908, 0x08080819, 0x08190808, 0x08080819,
|
||||
0x08191919, 0x08080819, 0x19080808, 0x08080819, 0x2b081908, 0x08080819, 0x2b192b08, 0x08080819,
|
||||
0x08080808, 0x0808082b, 0x0808082b, 0x0808082b, 0x082b082b, 0x0808082b, 0x2b08082b, 0x0808082b,
|
||||
0x08080819, 0x08081908, 0x08081908, 0x08081908, 0x08190808, 0x08081908, 0x082b0819, 0x08081908,
|
||||
0x082b1908, 0x08081908, 0x19080808, 0x08081908, 0x1908082b, 0x08081908, 0x19082b08, 0x08081908,
|
||||
0x192b0808, 0x08081908, 0x2b080819, 0x08081908, 0x2b081908, 0x08081908, 0x2b190808, 0x08081908,
|
||||
0x2b2b1908, 0x08081908, 0x08080808, 0x08081919, 0x0808082b, 0x08081919, 0x08082b08, 0x08081919,
|
||||
0x082b0808, 0x08081919, 0x1908192b, 0x08081919, 0x192b2b19, 0x08081919, 0x2b080808, 0x08081919,
|
||||
0x2b190819, 0x08081919, 0x08082b19, 0x0808192b, 0x08190808, 0x0808192b, 0x19080808, 0x0808192b,
|
||||
0x2b081908, 0x0808192b, 0x2b2b1908, 0x0808192b, 0x08080808, 0x08082b08, 0x08081919, 0x08082b08,
|
||||
0x08082b08, 0x08082b08, 0x08191908, 0x08082b08, 0x082b2b08, 0x08082b08, 0x19080819, 0x08082b08,
|
||||
0x19081908, 0x08082b08, 0x19190808, 0x08082b08, 0x1919082b, 0x08082b08, 0x2b082b08, 0x08082b08,
|
||||
0x08081908, 0x08082b19, 0x19080808, 0x08082b19, 0x0808082b, 0x08082b2b, 0x08191908, 0x08082b2b,
|
||||
0x08080819, 0x08190808, 0x08081908, 0x08190808, 0x08190808, 0x08190808, 0x082b0819, 0x08190808,
|
||||
0x19080808, 0x08190808, 0x192b0808, 0x08190808, 0x2b081908, 0x08190808, 0x2b190808, 0x08190808,
|
||||
0x2b191919, 0x08190808, 0x08080808, 0x08190819, 0x08082b08, 0x08190819, 0x082b0808, 0x08190819,
|
||||
0x19190808, 0x08190819, 0x19192b2b, 0x08190819, 0x2b080808, 0x08190819, 0x082b1908, 0x0819082b,
|
||||
0x19081919, 0x0819082b, 0x08080808, 0x08191908, 0x08082b08, 0x08191908, 0x082b0808, 0x08191908,
|
||||
0x082b1919, 0x08191908, 0x19082b19, 0x08191908, 0x2b080808, 0x08191908, 0x08192b08, 0x08191919,
|
||||
0x192b082b, 0x08191919, 0x08080808, 0x0819192b, 0x0819192b, 0x0819192b, 0x08080819, 0x08192b08,
|
||||
0x08081908, 0x08192b08, 0x08190808, 0x08192b08, 0x19080808, 0x08192b08, 0x2b080819, 0x08192b08,
|
||||
0x08080808, 0x08192b19, 0x08081919, 0x08192b19, 0x2b2b0808, 0x08192b19, 0x19190819, 0x08192b2b,
|
||||
0x08080808, 0x082b0808, 0x0808082b, 0x082b0808, 0x08082b2b, 0x082b0808, 0x19081908, 0x082b0808,
|
||||
0x192b0819, 0x082b0808, 0x2b080808, 0x082b0808, 0x2b08082b, 0x082b0808, 0x082b2b19, 0x082b0819,
|
||||
0x19082b08, 0x082b0819, 0x08080808, 0x082b082b, 0x0808082b, 0x082b082b, 0x08080819, 0x082b1908,
|
||||
0x08081908, 0x082b1908, 0x08190808, 0x082b1908, 0x19080808, 0x082b1908, 0x1919192b, 0x082b1908,
|
||||
0x08080808, 0x082b1919, 0x19080819, 0x082b1919, 0x192b1908, 0x082b1919, 0x2b190808, 0x082b192b,
|
||||
0x08082b08, 0x082b2b08, 0x082b0808, 0x082b2b08, 0x2b191908, 0x082b2b08, 0x19081908, 0x082b2b2b,
|
||||
0x08080819, 0x19080808, 0x08081908, 0x19080808, 0x08190808, 0x19080808, 0x08192b08, 0x19080808,
|
||||
0x082b0819, 0x19080808, 0x082b1908, 0x19080808, 0x19080808, 0x19080808, 0x19082b08, 0x19080808,
|
||||
0x1919192b, 0x19080808, 0x192b0808, 0x19080808, 0x2b080819, 0x19080808, 0x2b081908, 0x19080808,
|
||||
0x2b190808, 0x19080808, 0x08080808, 0x19080819, 0x082b0808, 0x19080819, 0x192b0819, 0x19080819,
|
||||
0x2b080808, 0x19080819, 0x2b081919, 0x19080819, 0x08080819, 0x1908082b, 0x08190808, 0x1908082b,
|
||||
0x19082b08, 0x1908082b, 0x1919192b, 0x1908082b, 0x192b2b08, 0x1908082b, 0x08080808, 0x19081908,
|
||||
0x08082b08, 0x19081908, 0x082b0808, 0x19081908, 0x2b080808, 0x19081908, 0x2b192b19, 0x19081908,
|
||||
0x0819082b, 0x19081919, 0x082b1908, 0x19081919, 0x08080808, 0x1908192b, 0x08080819, 0x19082b08,
|
||||
0x08081908, 0x19082b08, 0x08190808, 0x19082b08, 0x19080808, 0x19082b08, 0x19081919, 0x19082b08,
|
||||
0x08080808, 0x19082b19, 0x19192b08, 0x19082b19, 0x192b0819, 0x19082b19, 0x2b08082b, 0x19082b19,
|
||||
0x19081919, 0x19082b2b, 0x2b190808, 0x19082b2b, 0x08080808, 0x19190808, 0x08082b08, 0x19190808,
|
||||
0x08190819, 0x19190808, 0x08192b19, 0x19190808, 0x082b0808, 0x19190808, 0x2b080808, 0x19190808,
|
||||
0x2b082b08, 0x19190808, 0x08081908, 0x19190819, 0x1908082b, 0x19190819, 0x2b2b1908, 0x19190819,
|
||||
0x2b190819, 0x1919082b, 0x2b190808, 0x19191908, 0x2b19082b, 0x19191908, 0x08082b2b, 0x19191919,
|
||||
0x08080819, 0x1919192b, 0x19191908, 0x1919192b, 0x08080808, 0x19192b08, 0x08190819, 0x19192b08,
|
||||
0x08192b19, 0x19192b08, 0x192b1908, 0x19192b08, 0x19080808, 0x19192b19, 0x08082b08, 0x19192b2b,
|
||||
0x08081908, 0x192b0808, 0x08190808, 0x192b0808, 0x19080808, 0x192b0808, 0x192b2b08, 0x192b0808,
|
||||
0x08080808, 0x192b0819, 0x19191919, 0x192b0819, 0x08192b08, 0x192b082b, 0x192b0808, 0x192b082b,
|
||||
0x08080808, 0x192b1908, 0x08081919, 0x192b1908, 0x08190808, 0x192b1919, 0x0819082b, 0x192b1919,
|
||||
0x2b081908, 0x192b1919, 0x1908082b, 0x192b2b08, 0x08080808, 0x2b080808, 0x0808082b, 0x2b080808,
|
||||
0x08082b2b, 0x2b080808, 0x19080819, 0x2b080808, 0x2b08082b, 0x2b080808, 0x08081908, 0x2b080819,
|
||||
0x08192b08, 0x2b080819, 0x19080808, 0x2b080819, 0x08190819, 0x2b08082b, 0x08080819, 0x2b081908,
|
||||
0x08081908, 0x2b081908, 0x08190808, 0x2b081908, 0x08191919, 0x2b081908, 0x19080808, 0x2b081908,
|
||||
0x192b0808, 0x2b081908, 0x08080808, 0x2b081919, 0x1908192b, 0x2b081919, 0x2b191908, 0x2b081919,
|
||||
0x08082b19, 0x2b08192b, 0x19080808, 0x2b08192b, 0x192b0808, 0x2b08192b, 0x0808082b, 0x2b082b08,
|
||||
0x08081908, 0x2b082b19, 0x08190819, 0x2b082b2b, 0x08081908, 0x2b190808, 0x08190808, 0x2b190808,
|
||||
0x082b1908, 0x2b190808, 0x19080808, 0x2b190808, 0x2b2b0819, 0x2b190808, 0x0819192b, 0x2b190819,
|
||||
0x2b080808, 0x2b190819, 0x19081919, 0x2b19082b, 0x08080808, 0x2b191908, 0x082b082b, 0x2b191908,
|
||||
0x19081908, 0x2b191908, 0x19190819, 0x2b191919, 0x2b080819, 0x2b192b08, 0x082b0808, 0x2b192b19,
|
||||
0x0808082b, 0x2b2b0808, 0x19190808, 0x2b2b0808, 0x2b081919, 0x2b2b0808, 0x08082b19, 0x2b2b0819,
|
||||
0x08080808, 0x2b2b082b, 0x08192b08, 0x2b2b1908, 0x19190808, 0x2b2b2b08, 0x08081908, 0x2b2b2b19
|
||||
);
|
||||
#enddecl(IQ2_XXS_GRID)
|
||||
|
||||
#decl(IQ2_XS_GRID)
|
||||
const iq2xs_grid = array<u32, 1024>(
|
||||
0x08080808, 0x08080808, 0x0808082b, 0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808,
|
||||
0x08082b2b, 0x08080808, 0x08190819, 0x08080808, 0x08191908, 0x08080808, 0x0819192b, 0x08080808,
|
||||
0x08192b19, 0x08080808, 0x082b0808, 0x08080808, 0x082b082b, 0x08080808, 0x082b1919, 0x08080808,
|
||||
0x082b2b08, 0x08080808, 0x19080819, 0x08080808, 0x19081908, 0x08080808, 0x1908192b, 0x08080808,
|
||||
0x19082b19, 0x08080808, 0x19190808, 0x08080808, 0x1919082b, 0x08080808, 0x19191919, 0x08080808,
|
||||
0x19192b08, 0x08080808, 0x192b0819, 0x08080808, 0x192b1908, 0x08080808, 0x2b080808, 0x08080808,
|
||||
0x2b08082b, 0x08080808, 0x2b081919, 0x08080808, 0x2b082b08, 0x08080808, 0x2b190819, 0x08080808,
|
||||
0x2b191908, 0x08080808, 0x2b192b19, 0x08080808, 0x2b2b0808, 0x08080808, 0x08080819, 0x08080819,
|
||||
0x08081908, 0x08080819, 0x0808192b, 0x08080819, 0x08082b19, 0x08080819, 0x08190808, 0x08080819,
|
||||
0x0819082b, 0x08080819, 0x08191919, 0x08080819, 0x08192b08, 0x08080819, 0x08192b2b, 0x08080819,
|
||||
0x082b0819, 0x08080819, 0x082b1908, 0x08080819, 0x19080808, 0x08080819, 0x1908082b, 0x08080819,
|
||||
0x19081919, 0x08080819, 0x19082b08, 0x08080819, 0x19190819, 0x08080819, 0x19191908, 0x08080819,
|
||||
0x192b0808, 0x08080819, 0x192b2b08, 0x08080819, 0x2b080819, 0x08080819, 0x2b081908, 0x08080819,
|
||||
0x2b190808, 0x08080819, 0x08080808, 0x0808082b, 0x0808082b, 0x0808082b, 0x08081919, 0x0808082b,
|
||||
0x08082b08, 0x0808082b, 0x08190819, 0x0808082b, 0x08191908, 0x0808082b, 0x082b0808, 0x0808082b,
|
||||
0x19080819, 0x0808082b, 0x19081908, 0x0808082b, 0x19190808, 0x0808082b, 0x19191919, 0x0808082b,
|
||||
0x2b080808, 0x0808082b, 0x2b082b2b, 0x0808082b, 0x08080819, 0x08081908, 0x08081908, 0x08081908,
|
||||
0x0808192b, 0x08081908, 0x08082b19, 0x08081908, 0x08190808, 0x08081908, 0x0819082b, 0x08081908,
|
||||
0x08191919, 0x08081908, 0x08192b08, 0x08081908, 0x082b0819, 0x08081908, 0x082b1908, 0x08081908,
|
||||
0x19080808, 0x08081908, 0x1908082b, 0x08081908, 0x19081919, 0x08081908, 0x19082b08, 0x08081908,
|
||||
0x19190819, 0x08081908, 0x19191908, 0x08081908, 0x1919192b, 0x08081908, 0x192b0808, 0x08081908,
|
||||
0x2b080819, 0x08081908, 0x2b081908, 0x08081908, 0x2b190808, 0x08081908, 0x08080808, 0x08081919,
|
||||
0x0808082b, 0x08081919, 0x08081919, 0x08081919, 0x08082b08, 0x08081919, 0x08190819, 0x08081919,
|
||||
0x08191908, 0x08081919, 0x082b0808, 0x08081919, 0x19080819, 0x08081919, 0x19081908, 0x08081919,
|
||||
0x19190808, 0x08081919, 0x192b0819, 0x08081919, 0x2b080808, 0x08081919, 0x08080819, 0x0808192b,
|
||||
0x08081908, 0x0808192b, 0x08190808, 0x0808192b, 0x082b192b, 0x0808192b, 0x19080808, 0x0808192b,
|
||||
0x1908082b, 0x0808192b, 0x2b081908, 0x0808192b, 0x08080808, 0x08082b08, 0x0808082b, 0x08082b08,
|
||||
0x08081919, 0x08082b08, 0x08082b08, 0x08082b08, 0x08082b2b, 0x08082b08, 0x08190819, 0x08082b08,
|
||||
0x08191908, 0x08082b08, 0x082b0808, 0x08082b08, 0x082b1919, 0x08082b08, 0x19080819, 0x08082b08,
|
||||
0x19081908, 0x08082b08, 0x19190808, 0x08082b08, 0x19192b08, 0x08082b08, 0x2b080808, 0x08082b08,
|
||||
0x2b2b0808, 0x08082b08, 0x2b2b2b2b, 0x08082b08, 0x08080819, 0x08082b19, 0x08081908, 0x08082b19,
|
||||
0x08190808, 0x08082b19, 0x19080808, 0x08082b19, 0x2b080819, 0x08082b19, 0x2b082b19, 0x08082b19,
|
||||
0x08080808, 0x08082b2b, 0x082b0808, 0x08082b2b, 0x082b2b08, 0x08082b2b, 0x2b19192b, 0x08082b2b,
|
||||
0x2b2b0808, 0x08082b2b, 0x08080819, 0x08190808, 0x08081908, 0x08190808, 0x0808192b, 0x08190808,
|
||||
0x08082b19, 0x08190808, 0x08190808, 0x08190808, 0x0819082b, 0x08190808, 0x08191919, 0x08190808,
|
||||
0x08192b08, 0x08190808, 0x082b0819, 0x08190808, 0x082b1908, 0x08190808, 0x19080808, 0x08190808,
|
||||
0x1908082b, 0x08190808, 0x19081919, 0x08190808, 0x19082b08, 0x08190808, 0x19190819, 0x08190808,
|
||||
0x19191908, 0x08190808, 0x192b0808, 0x08190808, 0x192b2b2b, 0x08190808, 0x2b080819, 0x08190808,
|
||||
0x2b081908, 0x08190808, 0x2b190808, 0x08190808, 0x08080808, 0x08190819, 0x0808082b, 0x08190819,
|
||||
0x08081919, 0x08190819, 0x08082b08, 0x08190819, 0x08190819, 0x08190819, 0x08191908, 0x08190819,
|
||||
0x082b0808, 0x08190819, 0x19080819, 0x08190819, 0x19081908, 0x08190819, 0x19190808, 0x08190819,
|
||||
0x2b080808, 0x08190819, 0x2b191908, 0x08190819, 0x2b19192b, 0x08190819, 0x08080819, 0x0819082b,
|
||||
0x08081908, 0x0819082b, 0x0808192b, 0x0819082b, 0x08190808, 0x0819082b, 0x19080808, 0x0819082b,
|
||||
0x192b0808, 0x0819082b, 0x08080808, 0x08191908, 0x0808082b, 0x08191908, 0x08081919, 0x08191908,
|
||||
0x08082b08, 0x08191908, 0x08190819, 0x08191908, 0x08191908, 0x08191908, 0x082b0808, 0x08191908,
|
||||
0x19080819, 0x08191908, 0x19081908, 0x08191908, 0x19082b19, 0x08191908, 0x19190808, 0x08191908,
|
||||
0x192b1908, 0x08191908, 0x2b080808, 0x08191908, 0x08080819, 0x08191919, 0x08081908, 0x08191919,
|
||||
0x08190808, 0x08191919, 0x19080808, 0x08191919, 0x08080808, 0x0819192b, 0x08191908, 0x0819192b,
|
||||
0x19082b19, 0x0819192b, 0x08080819, 0x08192b08, 0x08081908, 0x08192b08, 0x08190808, 0x08192b08,
|
||||
0x0819082b, 0x08192b08, 0x19080808, 0x08192b08, 0x19191908, 0x08192b08, 0x2b08192b, 0x08192b08,
|
||||
0x08080808, 0x08192b19, 0x08081919, 0x08192b19, 0x192b192b, 0x08192b19, 0x19190819, 0x08192b2b,
|
||||
0x2b2b2b19, 0x08192b2b, 0x08080808, 0x082b0808, 0x0808082b, 0x082b0808, 0x08081919, 0x082b0808,
|
||||
0x08082b08, 0x082b0808, 0x08082b2b, 0x082b0808, 0x08190819, 0x082b0808, 0x08191908, 0x082b0808,
|
||||
0x082b0808, 0x082b0808, 0x19080819, 0x082b0808, 0x19081908, 0x082b0808, 0x19190808, 0x082b0808,
|
||||
0x2b080808, 0x082b0808, 0x2b2b0808, 0x082b0808, 0x08080819, 0x082b0819, 0x08081908, 0x082b0819,
|
||||
0x08190808, 0x082b0819, 0x19080808, 0x082b0819, 0x19082b08, 0x082b0819, 0x192b1919, 0x082b0819,
|
||||
0x08080808, 0x082b082b, 0x082b082b, 0x082b082b, 0x2b080808, 0x082b082b, 0x2b2b2b08, 0x082b082b,
|
||||
0x08080819, 0x082b1908, 0x08081908, 0x082b1908, 0x08190808, 0x082b1908, 0x082b2b19, 0x082b1908,
|
||||
0x19080808, 0x082b1908, 0x08080808, 0x082b1919, 0x19080819, 0x082b1919, 0x1919082b, 0x082b1919,
|
||||
0x2b192b19, 0x082b1919, 0x08080819, 0x082b192b, 0x08192b2b, 0x082b192b, 0x2b2b192b, 0x082b192b,
|
||||
0x08080808, 0x082b2b08, 0x08082b08, 0x082b2b08, 0x08082b2b, 0x082b2b08, 0x082b0808, 0x082b2b08,
|
||||
0x19191919, 0x082b2b08, 0x2b082b08, 0x082b2b08, 0x2b2b082b, 0x082b2b08, 0x192b2b08, 0x082b2b19,
|
||||
0x2b190808, 0x082b2b19, 0x08082b08, 0x082b2b2b, 0x082b0808, 0x082b2b2b, 0x2b08082b, 0x082b2b2b,
|
||||
0x2b082b08, 0x082b2b2b, 0x2b082b2b, 0x082b2b2b, 0x08080819, 0x19080808, 0x08081908, 0x19080808,
|
||||
0x0808192b, 0x19080808, 0x08082b19, 0x19080808, 0x08190808, 0x19080808, 0x0819082b, 0x19080808,
|
||||
0x08191919, 0x19080808, 0x08192b08, 0x19080808, 0x082b0819, 0x19080808, 0x082b1908, 0x19080808,
|
||||
0x19080808, 0x19080808, 0x1908082b, 0x19080808, 0x19081919, 0x19080808, 0x19082b08, 0x19080808,
|
||||
0x19082b2b, 0x19080808, 0x19190819, 0x19080808, 0x19191908, 0x19080808, 0x192b0808, 0x19080808,
|
||||
0x192b1919, 0x19080808, 0x2b080819, 0x19080808, 0x2b081908, 0x19080808, 0x2b190808, 0x19080808,
|
||||
0x08080808, 0x19080819, 0x0808082b, 0x19080819, 0x08081919, 0x19080819, 0x08082b08, 0x19080819,
|
||||
0x08190819, 0x19080819, 0x08191908, 0x19080819, 0x082b0808, 0x19080819, 0x19080819, 0x19080819,
|
||||
0x19081908, 0x19080819, 0x19190808, 0x19080819, 0x2b080808, 0x19080819, 0x2b081919, 0x19080819,
|
||||
0x2b2b082b, 0x19080819, 0x08080819, 0x1908082b, 0x08081908, 0x1908082b, 0x08190808, 0x1908082b,
|
||||
0x0819082b, 0x1908082b, 0x082b2b19, 0x1908082b, 0x19080808, 0x1908082b, 0x08080808, 0x19081908,
|
||||
0x0808082b, 0x19081908, 0x08081919, 0x19081908, 0x08082b08, 0x19081908, 0x08190819, 0x19081908,
|
||||
0x08191908, 0x19081908, 0x08192b19, 0x19081908, 0x082b0808, 0x19081908, 0x19080819, 0x19081908,
|
||||
0x19081908, 0x19081908, 0x19190808, 0x19081908, 0x2b080808, 0x19081908, 0x2b191908, 0x19081908,
|
||||
0x08080819, 0x19081919, 0x08081908, 0x19081919, 0x08190808, 0x19081919, 0x082b1908, 0x19081919,
|
||||
0x19080808, 0x19081919, 0x2b192b2b, 0x19081919, 0x08080808, 0x1908192b, 0x08082b2b, 0x1908192b,
|
||||
0x19081908, 0x1908192b, 0x19190808, 0x1908192b, 0x08080819, 0x19082b08, 0x08081908, 0x19082b08,
|
||||
0x08190808, 0x19082b08, 0x19080808, 0x19082b08, 0x19081919, 0x19082b08, 0x19191908, 0x19082b08,
|
||||
0x192b082b, 0x19082b08, 0x08080808, 0x19082b19, 0x08190819, 0x19082b19, 0x19081908, 0x19082b19,
|
||||
0x19190808, 0x19082b19, 0x192b2b19, 0x19082b19, 0x08081908, 0x19082b2b, 0x08080808, 0x19190808,
|
||||
0x0808082b, 0x19190808, 0x08081919, 0x19190808, 0x08082b08, 0x19190808, 0x08190819, 0x19190808,
|
||||
0x08191908, 0x19190808, 0x082b0808, 0x19190808, 0x082b2b08, 0x19190808, 0x19080819, 0x19190808,
|
||||
0x19081908, 0x19190808, 0x19190808, 0x19190808, 0x2b080808, 0x19190808, 0x08080819, 0x19190819,
|
||||
0x08081908, 0x19190819, 0x08190808, 0x19190819, 0x08191919, 0x19190819, 0x19080808, 0x19190819,
|
||||
0x1908082b, 0x19190819, 0x08080808, 0x1919082b, 0x19081908, 0x1919082b, 0x2b2b2b2b, 0x1919082b,
|
||||
0x08080819, 0x19191908, 0x08081908, 0x19191908, 0x08190808, 0x19191908, 0x082b0819, 0x19191908,
|
||||
0x19080808, 0x19191908, 0x192b0808, 0x19191908, 0x2b080819, 0x19191908, 0x2b2b0819, 0x19191908,
|
||||
0x08080808, 0x19191919, 0x08082b08, 0x19191919, 0x2b080808, 0x19191919, 0x2b082b08, 0x19191919,
|
||||
0x082b0819, 0x1919192b, 0x192b2b08, 0x1919192b, 0x2b2b0819, 0x1919192b, 0x08080808, 0x19192b08,
|
||||
0x08191908, 0x19192b08, 0x19080819, 0x19192b08, 0x19190808, 0x19192b08, 0x2b192b19, 0x19192b08,
|
||||
0x08192b2b, 0x19192b19, 0x19080808, 0x19192b19, 0x1908082b, 0x19192b19, 0x2b081919, 0x19192b2b,
|
||||
0x08080819, 0x192b0808, 0x08081908, 0x192b0808, 0x08190808, 0x192b0808, 0x19080808, 0x192b0808,
|
||||
0x19191908, 0x192b0808, 0x192b082b, 0x192b0808, 0x2b08192b, 0x192b0808, 0x2b2b2b19, 0x192b0808,
|
||||
0x08080808, 0x192b0819, 0x082b1908, 0x192b082b, 0x19082b2b, 0x192b082b, 0x2b19082b, 0x192b082b,
|
||||
0x08080808, 0x192b1908, 0x0819192b, 0x192b1908, 0x08190808, 0x192b1919, 0x19080808, 0x192b1919,
|
||||
0x19081919, 0x192b1919, 0x2b2b1908, 0x192b1919, 0x08080819, 0x192b2b08, 0x192b2b2b, 0x192b2b08,
|
||||
0x082b1919, 0x192b2b19, 0x0808192b, 0x192b2b2b, 0x19191908, 0x192b2b2b, 0x192b082b, 0x192b2b2b,
|
||||
0x08080808, 0x2b080808, 0x0808082b, 0x2b080808, 0x08081919, 0x2b080808, 0x08082b08, 0x2b080808,
|
||||
0x08190819, 0x2b080808, 0x08191908, 0x2b080808, 0x082b0808, 0x2b080808, 0x082b2b2b, 0x2b080808,
|
||||
0x19080819, 0x2b080808, 0x19081908, 0x2b080808, 0x19190808, 0x2b080808, 0x2b080808, 0x2b080808,
|
||||
0x2b08082b, 0x2b080808, 0x2b2b2b08, 0x2b080808, 0x2b2b2b2b, 0x2b080808, 0x08080819, 0x2b080819,
|
||||
0x08081908, 0x2b080819, 0x0808192b, 0x2b080819, 0x08190808, 0x2b080819, 0x19080808, 0x2b080819,
|
||||
0x19190819, 0x2b080819, 0x19192b19, 0x2b080819, 0x08080808, 0x2b08082b, 0x082b0808, 0x2b08082b,
|
||||
0x2b080808, 0x2b08082b, 0x2b08082b, 0x2b08082b, 0x2b2b0808, 0x2b08082b, 0x2b2b2b08, 0x2b08082b,
|
||||
0x08080819, 0x2b081908, 0x08081908, 0x2b081908, 0x08190808, 0x2b081908, 0x0819082b, 0x2b081908,
|
||||
0x08191919, 0x2b081908, 0x19080808, 0x2b081908, 0x192b0808, 0x2b081908, 0x2b082b19, 0x2b081908,
|
||||
0x08080808, 0x2b081919, 0x19081908, 0x2b081919, 0x2b2b1919, 0x2b081919, 0x08192b08, 0x2b08192b,
|
||||
0x192b2b2b, 0x2b08192b, 0x08080808, 0x2b082b08, 0x08082b08, 0x2b082b08, 0x082b1919, 0x2b082b08,
|
||||
0x19192b2b, 0x2b082b08, 0x2b080808, 0x2b082b08, 0x2b08082b, 0x2b082b08, 0x2b2b2b08, 0x2b082b08,
|
||||
0x0808192b, 0x2b082b19, 0x082b082b, 0x2b082b2b, 0x2b080808, 0x2b082b2b, 0x2b082b08, 0x2b082b2b,
|
||||
0x2b19192b, 0x2b082b2b, 0x2b2b2b08, 0x2b082b2b, 0x08080819, 0x2b190808, 0x08081908, 0x2b190808,
|
||||
0x08190808, 0x2b190808, 0x19080808, 0x2b190808, 0x1919192b, 0x2b190808, 0x2b081908, 0x2b190808,
|
||||
0x08080808, 0x2b190819, 0x082b082b, 0x2b190819, 0x192b1908, 0x2b190819, 0x1919192b, 0x2b19082b,
|
||||
0x2b082b19, 0x2b19082b, 0x08080808, 0x2b191908, 0x08081919, 0x2b191908, 0x19081908, 0x2b191908,
|
||||
0x19190808, 0x2b191908, 0x19192b08, 0x2b191908, 0x082b2b19, 0x2b191919, 0x2b190808, 0x2b191919,
|
||||
0x2b19082b, 0x2b191919, 0x19080819, 0x2b19192b, 0x19190819, 0x2b192b08, 0x2b2b192b, 0x2b192b08,
|
||||
0x19082b19, 0x2b192b19, 0x08191919, 0x2b192b2b, 0x192b0808, 0x2b192b2b, 0x08080808, 0x2b2b0808,
|
||||
0x0808082b, 0x2b2b0808, 0x08082b08, 0x2b2b0808, 0x08082b2b, 0x2b2b0808, 0x082b0808, 0x2b2b0808,
|
||||
0x082b2b2b, 0x2b2b0808, 0x2b2b0808, 0x2b2b0808, 0x19190819, 0x2b2b0819, 0x19192b19, 0x2b2b0819,
|
||||
0x2b2b192b, 0x2b2b0819, 0x08080808, 0x2b2b082b, 0x0808082b, 0x2b2b082b, 0x08082b08, 0x2b2b082b,
|
||||
0x082b2b2b, 0x2b2b082b, 0x2b080808, 0x2b2b082b, 0x2b2b0808, 0x2b2b082b, 0x19080808, 0x2b2b1908,
|
||||
0x2b191919, 0x2b2b1908, 0x192b1919, 0x2b2b192b, 0x2b192b08, 0x2b2b192b, 0x08082b2b, 0x2b2b2b08,
|
||||
0x082b0808, 0x2b2b2b08, 0x082b082b, 0x2b2b2b08, 0x082b2b08, 0x2b2b2b08, 0x2b2b0808, 0x2b2b2b08,
|
||||
0x2b2b2b08, 0x2b2b2b08, 0x08081908, 0x2b2b2b19, 0x2b081908, 0x2b2b2b19, 0x2b08192b, 0x2b2b2b19,
|
||||
0x082b2b08, 0x2b2b2b2b, 0x082b2b2b, 0x2b2b2b2b, 0x2b190819, 0x2b2b2b2b, 0x2b2b2b2b, 0x2b2b2b2b
|
||||
);
|
||||
#enddecl(IQ2_XS_GRID)
|
||||
|
||||
#decl(IQ2_S_GRID)
|
||||
const iq2s_grid = array<u32, 2048>(
|
||||
0x08080808, 0x08080808, 0x0808082b, 0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808,
|
||||
0x08082b2b, 0x08080808, 0x08190819, 0x08080808, 0x08191908, 0x08080808, 0x0819192b, 0x08080808,
|
||||
0x08192b19, 0x08080808, 0x082b0808, 0x08080808, 0x082b082b, 0x08080808, 0x082b1919, 0x08080808,
|
||||
0x082b2b08, 0x08080808, 0x19080819, 0x08080808, 0x19081908, 0x08080808, 0x1908192b, 0x08080808,
|
||||
0x19082b19, 0x08080808, 0x19190808, 0x08080808, 0x1919082b, 0x08080808, 0x19191919, 0x08080808,
|
||||
0x19192b08, 0x08080808, 0x192b0819, 0x08080808, 0x192b1908, 0x08080808, 0x192b192b, 0x08080808,
|
||||
0x192b2b19, 0x08080808, 0x2b080808, 0x08080808, 0x2b08082b, 0x08080808, 0x2b081919, 0x08080808,
|
||||
0x2b082b08, 0x08080808, 0x2b190819, 0x08080808, 0x2b191908, 0x08080808, 0x2b2b0808, 0x08080808,
|
||||
0x2b2b1919, 0x08080808, 0x2b2b2b2b, 0x08080808, 0x08080819, 0x08080819, 0x08081908, 0x08080819,
|
||||
0x0808192b, 0x08080819, 0x08082b19, 0x08080819, 0x08190808, 0x08080819, 0x0819082b, 0x08080819,
|
||||
0x08191919, 0x08080819, 0x08192b08, 0x08080819, 0x082b0819, 0x08080819, 0x082b1908, 0x08080819,
|
||||
0x19080808, 0x08080819, 0x1908082b, 0x08080819, 0x19081919, 0x08080819, 0x19082b08, 0x08080819,
|
||||
0x19190819, 0x08080819, 0x19191908, 0x08080819, 0x1919192b, 0x08080819, 0x19192b19, 0x08080819,
|
||||
0x192b0808, 0x08080819, 0x192b1919, 0x08080819, 0x192b2b08, 0x08080819, 0x2b080819, 0x08080819,
|
||||
0x2b081908, 0x08080819, 0x2b190808, 0x08080819, 0x2b19082b, 0x08080819, 0x2b191919, 0x08080819,
|
||||
0x2b2b0819, 0x08080819, 0x2b2b1908, 0x08080819, 0x08080808, 0x0808082b, 0x0808082b, 0x0808082b,
|
||||
0x08081919, 0x0808082b, 0x08082b08, 0x0808082b, 0x08190819, 0x0808082b, 0x08191908, 0x0808082b,
|
||||
0x082b0808, 0x0808082b, 0x082b2b2b, 0x0808082b, 0x19080819, 0x0808082b, 0x19081908, 0x0808082b,
|
||||
0x1908192b, 0x0808082b, 0x19082b19, 0x0808082b, 0x19190808, 0x0808082b, 0x19191919, 0x0808082b,
|
||||
0x2b080808, 0x0808082b, 0x2b081919, 0x0808082b, 0x2b082b2b, 0x0808082b, 0x2b191908, 0x0808082b,
|
||||
0x2b2b082b, 0x0808082b, 0x08080819, 0x08081908, 0x08081908, 0x08081908, 0x0808192b, 0x08081908,
|
||||
0x08082b19, 0x08081908, 0x08190808, 0x08081908, 0x0819082b, 0x08081908, 0x08191919, 0x08081908,
|
||||
0x08192b08, 0x08081908, 0x082b0819, 0x08081908, 0x082b1908, 0x08081908, 0x082b192b, 0x08081908,
|
||||
0x082b2b19, 0x08081908, 0x19080808, 0x08081908, 0x1908082b, 0x08081908, 0x19081919, 0x08081908,
|
||||
0x19082b08, 0x08081908, 0x19082b2b, 0x08081908, 0x19190819, 0x08081908, 0x19191908, 0x08081908,
|
||||
0x1919192b, 0x08081908, 0x19192b19, 0x08081908, 0x192b0808, 0x08081908, 0x192b082b, 0x08081908,
|
||||
0x192b1919, 0x08081908, 0x2b080819, 0x08081908, 0x2b081908, 0x08081908, 0x2b08192b, 0x08081908,
|
||||
0x2b082b19, 0x08081908, 0x2b190808, 0x08081908, 0x2b191919, 0x08081908, 0x2b192b08, 0x08081908,
|
||||
0x2b2b0819, 0x08081908, 0x2b2b1908, 0x08081908, 0x08080808, 0x08081919, 0x0808082b, 0x08081919,
|
||||
0x08081919, 0x08081919, 0x08082b08, 0x08081919, 0x08082b2b, 0x08081919, 0x08190819, 0x08081919,
|
||||
0x08191908, 0x08081919, 0x0819192b, 0x08081919, 0x08192b19, 0x08081919, 0x082b0808, 0x08081919,
|
||||
0x082b1919, 0x08081919, 0x082b2b08, 0x08081919, 0x19080819, 0x08081919, 0x19081908, 0x08081919,
|
||||
0x1908192b, 0x08081919, 0x19082b19, 0x08081919, 0x19190808, 0x08081919, 0x1919082b, 0x08081919,
|
||||
0x19191919, 0x08081919, 0x19192b08, 0x08081919, 0x192b0819, 0x08081919, 0x192b1908, 0x08081919,
|
||||
0x2b080808, 0x08081919, 0x2b08082b, 0x08081919, 0x2b081919, 0x08081919, 0x2b082b08, 0x08081919,
|
||||
0x2b190819, 0x08081919, 0x2b191908, 0x08081919, 0x2b2b0808, 0x08081919, 0x08080819, 0x0808192b,
|
||||
0x08081908, 0x0808192b, 0x0808192b, 0x0808192b, 0x08082b19, 0x0808192b, 0x08190808, 0x0808192b,
|
||||
0x08191919, 0x0808192b, 0x19080808, 0x0808192b, 0x19081919, 0x0808192b, 0x19082b08, 0x0808192b,
|
||||
0x19190819, 0x0808192b, 0x19191908, 0x0808192b, 0x192b0808, 0x0808192b, 0x2b080819, 0x0808192b,
|
||||
0x2b081908, 0x0808192b, 0x2b190808, 0x0808192b, 0x08080808, 0x08082b08, 0x0808082b, 0x08082b08,
|
||||
0x08081919, 0x08082b08, 0x08082b08, 0x08082b08, 0x08190819, 0x08082b08, 0x08191908, 0x08082b08,
|
||||
0x0819192b, 0x08082b08, 0x08192b19, 0x08082b08, 0x082b0808, 0x08082b08, 0x082b1919, 0x08082b08,
|
||||
0x082b2b2b, 0x08082b08, 0x19080819, 0x08082b08, 0x19081908, 0x08082b08, 0x1908192b, 0x08082b08,
|
||||
0x19082b19, 0x08082b08, 0x19190808, 0x08082b08, 0x1919082b, 0x08082b08, 0x19191919, 0x08082b08,
|
||||
0x19192b08, 0x08082b08, 0x192b0819, 0x08082b08, 0x192b1908, 0x08082b08, 0x2b080808, 0x08082b08,
|
||||
0x2b081919, 0x08082b08, 0x2b191908, 0x08082b08, 0x2b2b2b2b, 0x08082b08, 0x08080819, 0x08082b19,
|
||||
0x08081908, 0x08082b19, 0x08190808, 0x08082b19, 0x0819082b, 0x08082b19, 0x08191919, 0x08082b19,
|
||||
0x08192b08, 0x08082b19, 0x082b0819, 0x08082b19, 0x19080808, 0x08082b19, 0x19081919, 0x08082b19,
|
||||
0x19082b08, 0x08082b19, 0x19190819, 0x08082b19, 0x19191908, 0x08082b19, 0x192b0808, 0x08082b19,
|
||||
0x2b080819, 0x08082b19, 0x2b190808, 0x08082b19, 0x08080808, 0x08082b2b, 0x08190819, 0x08082b2b,
|
||||
0x08191908, 0x08082b2b, 0x082b082b, 0x08082b2b, 0x082b2b08, 0x08082b2b, 0x082b2b2b, 0x08082b2b,
|
||||
0x19190808, 0x08082b2b, 0x2b192b19, 0x08082b2b, 0x08080819, 0x08190808, 0x08081908, 0x08190808,
|
||||
0x0808192b, 0x08190808, 0x08082b19, 0x08190808, 0x08190808, 0x08190808, 0x0819082b, 0x08190808,
|
||||
0x08191919, 0x08190808, 0x08192b08, 0x08190808, 0x082b0819, 0x08190808, 0x082b1908, 0x08190808,
|
||||
0x082b192b, 0x08190808, 0x19080808, 0x08190808, 0x1908082b, 0x08190808, 0x19081919, 0x08190808,
|
||||
0x19082b08, 0x08190808, 0x19190819, 0x08190808, 0x19191908, 0x08190808, 0x1919192b, 0x08190808,
|
||||
0x19192b19, 0x08190808, 0x192b0808, 0x08190808, 0x192b082b, 0x08190808, 0x192b1919, 0x08190808,
|
||||
0x192b2b08, 0x08190808, 0x2b080819, 0x08190808, 0x2b081908, 0x08190808, 0x2b08192b, 0x08190808,
|
||||
0x2b190808, 0x08190808, 0x2b191919, 0x08190808, 0x2b192b08, 0x08190808, 0x2b2b0819, 0x08190808,
|
||||
0x2b2b1908, 0x08190808, 0x08080808, 0x08190819, 0x0808082b, 0x08190819, 0x08081919, 0x08190819,
|
||||
0x08082b08, 0x08190819, 0x08082b2b, 0x08190819, 0x08190819, 0x08190819, 0x08191908, 0x08190819,
|
||||
0x0819192b, 0x08190819, 0x08192b19, 0x08190819, 0x082b0808, 0x08190819, 0x082b082b, 0x08190819,
|
||||
0x082b1919, 0x08190819, 0x082b2b08, 0x08190819, 0x19080819, 0x08190819, 0x19081908, 0x08190819,
|
||||
0x1908192b, 0x08190819, 0x19082b19, 0x08190819, 0x19190808, 0x08190819, 0x1919082b, 0x08190819,
|
||||
0x19191919, 0x08190819, 0x19192b08, 0x08190819, 0x192b0819, 0x08190819, 0x192b1908, 0x08190819,
|
||||
0x2b080808, 0x08190819, 0x2b08082b, 0x08190819, 0x2b081919, 0x08190819, 0x2b082b08, 0x08190819,
|
||||
0x2b190819, 0x08190819, 0x2b191908, 0x08190819, 0x08080819, 0x0819082b, 0x08081908, 0x0819082b,
|
||||
0x08082b19, 0x0819082b, 0x08190808, 0x0819082b, 0x08191919, 0x0819082b, 0x082b0819, 0x0819082b,
|
||||
0x082b1908, 0x0819082b, 0x19080808, 0x0819082b, 0x19081919, 0x0819082b, 0x19190819, 0x0819082b,
|
||||
0x19191908, 0x0819082b, 0x2b080819, 0x0819082b, 0x2b081908, 0x0819082b, 0x2b190808, 0x0819082b,
|
||||
0x08080808, 0x08191908, 0x0808082b, 0x08191908, 0x08081919, 0x08191908, 0x08082b08, 0x08191908,
|
||||
0x08190819, 0x08191908, 0x08191908, 0x08191908, 0x0819192b, 0x08191908, 0x08192b19, 0x08191908,
|
||||
0x082b0808, 0x08191908, 0x082b1919, 0x08191908, 0x082b2b08, 0x08191908, 0x19080819, 0x08191908,
|
||||
0x19081908, 0x08191908, 0x1908192b, 0x08191908, 0x19082b19, 0x08191908, 0x19190808, 0x08191908,
|
||||
0x1919082b, 0x08191908, 0x19191919, 0x08191908, 0x19192b08, 0x08191908, 0x192b0819, 0x08191908,
|
||||
0x192b1908, 0x08191908, 0x2b080808, 0x08191908, 0x2b08082b, 0x08191908, 0x2b081919, 0x08191908,
|
||||
0x2b082b08, 0x08191908, 0x2b190819, 0x08191908, 0x2b191908, 0x08191908, 0x2b2b0808, 0x08191908,
|
||||
0x08080819, 0x08191919, 0x08081908, 0x08191919, 0x0808192b, 0x08191919, 0x08082b19, 0x08191919,
|
||||
0x08190808, 0x08191919, 0x0819082b, 0x08191919, 0x08191919, 0x08191919, 0x08192b08, 0x08191919,
|
||||
0x082b0819, 0x08191919, 0x082b1908, 0x08191919, 0x19080808, 0x08191919, 0x1908082b, 0x08191919,
|
||||
0x19081919, 0x08191919, 0x19082b08, 0x08191919, 0x19190819, 0x08191919, 0x19191908, 0x08191919,
|
||||
0x192b0808, 0x08191919, 0x2b080819, 0x08191919, 0x2b081908, 0x08191919, 0x2b190808, 0x08191919,
|
||||
0x08080808, 0x0819192b, 0x08081919, 0x0819192b, 0x08082b08, 0x0819192b, 0x08190819, 0x0819192b,
|
||||
0x08191908, 0x0819192b, 0x082b0808, 0x0819192b, 0x19080819, 0x0819192b, 0x19081908, 0x0819192b,
|
||||
0x19190808, 0x0819192b, 0x2b080808, 0x0819192b, 0x2b2b2b2b, 0x0819192b, 0x08080819, 0x08192b08,
|
||||
0x08081908, 0x08192b08, 0x0808192b, 0x08192b08, 0x08082b19, 0x08192b08, 0x08190808, 0x08192b08,
|
||||
0x08191919, 0x08192b08, 0x08192b08, 0x08192b08, 0x082b0819, 0x08192b08, 0x19080808, 0x08192b08,
|
||||
0x1908082b, 0x08192b08, 0x19081919, 0x08192b08, 0x19082b08, 0x08192b08, 0x19190819, 0x08192b08,
|
||||
0x19191908, 0x08192b08, 0x192b0808, 0x08192b08, 0x2b080819, 0x08192b08, 0x2b081908, 0x08192b08,
|
||||
0x08080808, 0x08192b19, 0x0808082b, 0x08192b19, 0x08081919, 0x08192b19, 0x08082b08, 0x08192b19,
|
||||
0x08190819, 0x08192b19, 0x08191908, 0x08192b19, 0x082b0808, 0x08192b19, 0x19080819, 0x08192b19,
|
||||
0x19081908, 0x08192b19, 0x19190808, 0x08192b19, 0x192b2b19, 0x08192b19, 0x2b2b082b, 0x08192b19,
|
||||
0x08081908, 0x08192b2b, 0x08190808, 0x08192b2b, 0x19080808, 0x08192b2b, 0x1919192b, 0x08192b2b,
|
||||
0x08080808, 0x082b0808, 0x0808082b, 0x082b0808, 0x08081919, 0x082b0808, 0x08082b08, 0x082b0808,
|
||||
0x08190819, 0x082b0808, 0x08191908, 0x082b0808, 0x0819192b, 0x082b0808, 0x08192b19, 0x082b0808,
|
||||
0x082b0808, 0x082b0808, 0x082b1919, 0x082b0808, 0x082b2b2b, 0x082b0808, 0x19080819, 0x082b0808,
|
||||
0x19081908, 0x082b0808, 0x19190808, 0x082b0808, 0x1919082b, 0x082b0808, 0x19191919, 0x082b0808,
|
||||
0x192b1908, 0x082b0808, 0x2b080808, 0x082b0808, 0x2b082b2b, 0x082b0808, 0x2b191908, 0x082b0808,
|
||||
0x2b2b2b2b, 0x082b0808, 0x08080819, 0x082b0819, 0x08081908, 0x082b0819, 0x08190808, 0x082b0819,
|
||||
0x0819082b, 0x082b0819, 0x08191919, 0x082b0819, 0x082b0819, 0x082b0819, 0x19080808, 0x082b0819,
|
||||
0x1908082b, 0x082b0819, 0x19081919, 0x082b0819, 0x19190819, 0x082b0819, 0x19191908, 0x082b0819,
|
||||
0x192b0808, 0x082b0819, 0x2b080819, 0x082b0819, 0x2b081908, 0x082b0819, 0x2b190808, 0x082b0819,
|
||||
0x08080808, 0x082b082b, 0x08082b2b, 0x082b082b, 0x082b082b, 0x082b082b, 0x082b2b08, 0x082b082b,
|
||||
0x082b2b2b, 0x082b082b, 0x19081908, 0x082b082b, 0x19190808, 0x082b082b, 0x2b082b08, 0x082b082b,
|
||||
0x2b082b2b, 0x082b082b, 0x2b2b2b08, 0x082b082b, 0x08080819, 0x082b1908, 0x08081908, 0x082b1908,
|
||||
0x0808192b, 0x082b1908, 0x08082b19, 0x082b1908, 0x08190808, 0x082b1908, 0x08191919, 0x082b1908,
|
||||
0x08192b08, 0x082b1908, 0x082b0819, 0x082b1908, 0x082b1908, 0x082b1908, 0x19080808, 0x082b1908,
|
||||
0x1908082b, 0x082b1908, 0x19081919, 0x082b1908, 0x19082b08, 0x082b1908, 0x19190819, 0x082b1908,
|
||||
0x19191908, 0x082b1908, 0x192b0808, 0x082b1908, 0x2b080819, 0x082b1908, 0x2b081908, 0x082b1908,
|
||||
0x2b190808, 0x082b1908, 0x08080808, 0x082b1919, 0x08081919, 0x082b1919, 0x08082b08, 0x082b1919,
|
||||
0x08190819, 0x082b1919, 0x08191908, 0x082b1919, 0x082b0808, 0x082b1919, 0x19080819, 0x082b1919,
|
||||
0x19081908, 0x082b1919, 0x19190808, 0x082b1919, 0x192b192b, 0x082b1919, 0x2b080808, 0x082b1919,
|
||||
0x08080819, 0x082b192b, 0x08081908, 0x082b192b, 0x08190808, 0x082b192b, 0x19080808, 0x082b192b,
|
||||
0x19192b19, 0x082b192b, 0x08080808, 0x082b2b08, 0x08081919, 0x082b2b08, 0x08190819, 0x082b2b08,
|
||||
0x08191908, 0x082b2b08, 0x19080819, 0x082b2b08, 0x19081908, 0x082b2b08, 0x19190808, 0x082b2b08,
|
||||
0x2b082b2b, 0x082b2b08, 0x2b2b2b2b, 0x082b2b08, 0x08080819, 0x082b2b19, 0x08081908, 0x082b2b19,
|
||||
0x08190808, 0x082b2b19, 0x2b191919, 0x082b2b19, 0x08082b2b, 0x082b2b2b, 0x082b082b, 0x082b2b2b,
|
||||
0x192b1908, 0x082b2b2b, 0x2b082b08, 0x082b2b2b, 0x2b082b2b, 0x082b2b2b, 0x08080819, 0x19080808,
|
||||
0x08081908, 0x19080808, 0x0808192b, 0x19080808, 0x08082b19, 0x19080808, 0x08190808, 0x19080808,
|
||||
0x0819082b, 0x19080808, 0x08191919, 0x19080808, 0x08192b08, 0x19080808, 0x08192b2b, 0x19080808,
|
||||
0x082b0819, 0x19080808, 0x082b1908, 0x19080808, 0x082b192b, 0x19080808, 0x19080808, 0x19080808,
|
||||
0x1908082b, 0x19080808, 0x19081919, 0x19080808, 0x19082b08, 0x19080808, 0x19082b2b, 0x19080808,
|
||||
0x19190819, 0x19080808, 0x19191908, 0x19080808, 0x1919192b, 0x19080808, 0x19192b19, 0x19080808,
|
||||
0x192b0808, 0x19080808, 0x192b082b, 0x19080808, 0x192b1919, 0x19080808, 0x2b080819, 0x19080808,
|
||||
0x2b081908, 0x19080808, 0x2b190808, 0x19080808, 0x2b191919, 0x19080808, 0x2b192b08, 0x19080808,
|
||||
0x2b2b0819, 0x19080808, 0x2b2b1908, 0x19080808, 0x08080808, 0x19080819, 0x0808082b, 0x19080819,
|
||||
0x08081919, 0x19080819, 0x08082b08, 0x19080819, 0x08190819, 0x19080819, 0x08191908, 0x19080819,
|
||||
0x0819192b, 0x19080819, 0x08192b19, 0x19080819, 0x082b0808, 0x19080819, 0x082b082b, 0x19080819,
|
||||
0x082b1919, 0x19080819, 0x19080819, 0x19080819, 0x19081908, 0x19080819, 0x1908192b, 0x19080819,
|
||||
0x19082b19, 0x19080819, 0x19190808, 0x19080819, 0x1919082b, 0x19080819, 0x19191919, 0x19080819,
|
||||
0x19192b08, 0x19080819, 0x192b0819, 0x19080819, 0x192b1908, 0x19080819, 0x2b080808, 0x19080819,
|
||||
0x2b08082b, 0x19080819, 0x2b081919, 0x19080819, 0x2b082b08, 0x19080819, 0x2b190819, 0x19080819,
|
||||
0x2b191908, 0x19080819, 0x2b2b0808, 0x19080819, 0x08080819, 0x1908082b, 0x08081908, 0x1908082b,
|
||||
0x08190808, 0x1908082b, 0x0819082b, 0x1908082b, 0x08191919, 0x1908082b, 0x08192b08, 0x1908082b,
|
||||
0x082b1908, 0x1908082b, 0x19080808, 0x1908082b, 0x19081919, 0x1908082b, 0x19082b08, 0x1908082b,
|
||||
0x19190819, 0x1908082b, 0x19191908, 0x1908082b, 0x192b0808, 0x1908082b, 0x2b080819, 0x1908082b,
|
||||
0x2b081908, 0x1908082b, 0x08080808, 0x19081908, 0x0808082b, 0x19081908, 0x08081919, 0x19081908,
|
||||
0x08082b08, 0x19081908, 0x08082b2b, 0x19081908, 0x08190819, 0x19081908, 0x08191908, 0x19081908,
|
||||
0x0819192b, 0x19081908, 0x08192b19, 0x19081908, 0x082b0808, 0x19081908, 0x082b082b, 0x19081908,
|
||||
0x082b1919, 0x19081908, 0x082b2b08, 0x19081908, 0x19080819, 0x19081908, 0x19081908, 0x19081908,
|
||||
0x1908192b, 0x19081908, 0x19082b19, 0x19081908, 0x19190808, 0x19081908, 0x1919082b, 0x19081908,
|
||||
0x19191919, 0x19081908, 0x19192b08, 0x19081908, 0x192b0819, 0x19081908, 0x192b1908, 0x19081908,
|
||||
0x2b080808, 0x19081908, 0x2b08082b, 0x19081908, 0x2b081919, 0x19081908, 0x2b082b08, 0x19081908,
|
||||
0x2b190819, 0x19081908, 0x2b191908, 0x19081908, 0x2b2b0808, 0x19081908, 0x08080819, 0x19081919,
|
||||
0x08081908, 0x19081919, 0x0808192b, 0x19081919, 0x08082b19, 0x19081919, 0x08190808, 0x19081919,
|
||||
0x0819082b, 0x19081919, 0x08191919, 0x19081919, 0x08192b08, 0x19081919, 0x082b0819, 0x19081919,
|
||||
0x082b1908, 0x19081919, 0x19080808, 0x19081919, 0x1908082b, 0x19081919, 0x19081919, 0x19081919,
|
||||
0x19082b08, 0x19081919, 0x19190819, 0x19081919, 0x19191908, 0x19081919, 0x192b0808, 0x19081919,
|
||||
0x192b2b2b, 0x19081919, 0x2b080819, 0x19081919, 0x2b081908, 0x19081919, 0x2b190808, 0x19081919,
|
||||
0x08080808, 0x1908192b, 0x0808082b, 0x1908192b, 0x08081919, 0x1908192b, 0x08082b08, 0x1908192b,
|
||||
0x08190819, 0x1908192b, 0x08191908, 0x1908192b, 0x082b0808, 0x1908192b, 0x19080819, 0x1908192b,
|
||||
0x19081908, 0x1908192b, 0x19190808, 0x1908192b, 0x2b080808, 0x1908192b, 0x2b2b1919, 0x1908192b,
|
||||
0x08080819, 0x19082b08, 0x08081908, 0x19082b08, 0x08082b19, 0x19082b08, 0x08190808, 0x19082b08,
|
||||
0x0819082b, 0x19082b08, 0x08191919, 0x19082b08, 0x08192b08, 0x19082b08, 0x082b0819, 0x19082b08,
|
||||
0x082b1908, 0x19082b08, 0x19080808, 0x19082b08, 0x1908082b, 0x19082b08, 0x19081919, 0x19082b08,
|
||||
0x19082b08, 0x19082b08, 0x19190819, 0x19082b08, 0x19191908, 0x19082b08, 0x192b0808, 0x19082b08,
|
||||
0x2b081908, 0x19082b08, 0x2b190808, 0x19082b08, 0x08080808, 0x19082b19, 0x0808082b, 0x19082b19,
|
||||
0x08081919, 0x19082b19, 0x08082b08, 0x19082b19, 0x08190819, 0x19082b19, 0x08191908, 0x19082b19,
|
||||
0x082b0808, 0x19082b19, 0x19080819, 0x19082b19, 0x19081908, 0x19082b19, 0x19190808, 0x19082b19,
|
||||
0x2b080808, 0x19082b19, 0x2b19192b, 0x19082b19, 0x08080819, 0x19082b2b, 0x08081908, 0x19082b2b,
|
||||
0x08190808, 0x19082b2b, 0x19080808, 0x19082b2b, 0x08080808, 0x19190808, 0x0808082b, 0x19190808,
|
||||
0x08081919, 0x19190808, 0x08082b08, 0x19190808, 0x08190819, 0x19190808, 0x08191908, 0x19190808,
|
||||
0x0819192b, 0x19190808, 0x08192b19, 0x19190808, 0x082b0808, 0x19190808, 0x082b082b, 0x19190808,
|
||||
0x082b1919, 0x19190808, 0x082b2b08, 0x19190808, 0x19080819, 0x19190808, 0x19081908, 0x19190808,
|
||||
0x1908192b, 0x19190808, 0x19082b19, 0x19190808, 0x19190808, 0x19190808, 0x1919082b, 0x19190808,
|
||||
0x19191919, 0x19190808, 0x19192b08, 0x19190808, 0x192b0819, 0x19190808, 0x192b1908, 0x19190808,
|
||||
0x2b080808, 0x19190808, 0x2b08082b, 0x19190808, 0x2b081919, 0x19190808, 0x2b082b08, 0x19190808,
|
||||
0x2b190819, 0x19190808, 0x2b191908, 0x19190808, 0x08080819, 0x19190819, 0x08081908, 0x19190819,
|
||||
0x0808192b, 0x19190819, 0x08082b19, 0x19190819, 0x08190808, 0x19190819, 0x0819082b, 0x19190819,
|
||||
0x08191919, 0x19190819, 0x08192b08, 0x19190819, 0x082b0819, 0x19190819, 0x082b1908, 0x19190819,
|
||||
0x19080808, 0x19190819, 0x1908082b, 0x19190819, 0x19081919, 0x19190819, 0x19082b08, 0x19190819,
|
||||
0x19190819, 0x19190819, 0x19191908, 0x19190819, 0x192b0808, 0x19190819, 0x2b080819, 0x19190819,
|
||||
0x2b081908, 0x19190819, 0x2b190808, 0x19190819, 0x08080808, 0x1919082b, 0x08081919, 0x1919082b,
|
||||
0x08082b08, 0x1919082b, 0x08190819, 0x1919082b, 0x08191908, 0x1919082b, 0x082b0808, 0x1919082b,
|
||||
0x19080819, 0x1919082b, 0x19081908, 0x1919082b, 0x19190808, 0x1919082b, 0x192b2b19, 0x1919082b,
|
||||
0x2b080808, 0x1919082b, 0x08080819, 0x19191908, 0x08081908, 0x19191908, 0x0808192b, 0x19191908,
|
||||
0x08082b19, 0x19191908, 0x08190808, 0x19191908, 0x0819082b, 0x19191908, 0x08191919, 0x19191908,
|
||||
0x08192b08, 0x19191908, 0x082b0819, 0x19191908, 0x082b1908, 0x19191908, 0x19080808, 0x19191908,
|
||||
0x1908082b, 0x19191908, 0x19081919, 0x19191908, 0x19082b08, 0x19191908, 0x19190819, 0x19191908,
|
||||
0x19191908, 0x19191908, 0x192b0808, 0x19191908, 0x2b080819, 0x19191908, 0x2b081908, 0x19191908,
|
||||
0x2b190808, 0x19191908, 0x08080808, 0x19191919, 0x0808082b, 0x19191919, 0x08081919, 0x19191919,
|
||||
0x08082b08, 0x19191919, 0x08190819, 0x19191919, 0x08191908, 0x19191919, 0x082b0808, 0x19191919,
|
||||
0x19080819, 0x19191919, 0x19081908, 0x19191919, 0x19190808, 0x19191919, 0x2b080808, 0x19191919,
|
||||
0x08080819, 0x1919192b, 0x08081908, 0x1919192b, 0x08190808, 0x1919192b, 0x082b192b, 0x1919192b,
|
||||
0x19080808, 0x1919192b, 0x08080808, 0x19192b08, 0x0808082b, 0x19192b08, 0x08081919, 0x19192b08,
|
||||
0x08082b08, 0x19192b08, 0x08190819, 0x19192b08, 0x08191908, 0x19192b08, 0x082b0808, 0x19192b08,
|
||||
0x19080819, 0x19192b08, 0x19081908, 0x19192b08, 0x19190808, 0x19192b08, 0x19192b2b, 0x19192b08,
|
||||
0x2b080808, 0x19192b08, 0x08080819, 0x19192b19, 0x08081908, 0x19192b19, 0x08190808, 0x19192b19,
|
||||
0x19080808, 0x19192b19, 0x08080808, 0x19192b2b, 0x08192b19, 0x19192b2b, 0x2b081919, 0x19192b2b,
|
||||
0x2b2b2b08, 0x19192b2b, 0x08080819, 0x192b0808, 0x08081908, 0x192b0808, 0x0808192b, 0x192b0808,
|
||||
0x08190808, 0x192b0808, 0x0819082b, 0x192b0808, 0x08191919, 0x192b0808, 0x08192b08, 0x192b0808,
|
||||
0x082b0819, 0x192b0808, 0x082b1908, 0x192b0808, 0x19080808, 0x192b0808, 0x19081919, 0x192b0808,
|
||||
0x19082b08, 0x192b0808, 0x19190819, 0x192b0808, 0x19191908, 0x192b0808, 0x192b0808, 0x192b0808,
|
||||
0x2b081908, 0x192b0808, 0x2b190808, 0x192b0808, 0x08080808, 0x192b0819, 0x0808082b, 0x192b0819,
|
||||
0x08081919, 0x192b0819, 0x08082b08, 0x192b0819, 0x08190819, 0x192b0819, 0x08191908, 0x192b0819,
|
||||
0x082b0808, 0x192b0819, 0x19080819, 0x192b0819, 0x19081908, 0x192b0819, 0x19190808, 0x192b0819,
|
||||
0x2b080808, 0x192b0819, 0x2b192b19, 0x192b0819, 0x08081908, 0x192b082b, 0x08190808, 0x192b082b,
|
||||
0x19080808, 0x192b082b, 0x1919192b, 0x192b082b, 0x2b2b0819, 0x192b082b, 0x08080808, 0x192b1908,
|
||||
0x08081919, 0x192b1908, 0x08082b08, 0x192b1908, 0x08190819, 0x192b1908, 0x08191908, 0x192b1908,
|
||||
0x082b0808, 0x192b1908, 0x19080819, 0x192b1908, 0x19081908, 0x192b1908, 0x19190808, 0x192b1908,
|
||||
0x2b080808, 0x192b1908, 0x08080819, 0x192b1919, 0x08081908, 0x192b1919, 0x08190808, 0x192b1919,
|
||||
0x19080808, 0x192b1919, 0x19082b2b, 0x192b1919, 0x192b2b08, 0x192b1919, 0x2b19082b, 0x192b1919,
|
||||
0x08080808, 0x192b192b, 0x2b191908, 0x192b192b, 0x08080819, 0x192b2b08, 0x08081908, 0x192b2b08,
|
||||
0x08190808, 0x192b2b08, 0x192b1919, 0x192b2b08, 0x2b192b08, 0x192b2b08, 0x08080808, 0x192b2b19,
|
||||
0x082b2b2b, 0x192b2b19, 0x1908082b, 0x192b2b2b, 0x2b2b0819, 0x192b2b2b, 0x08080808, 0x2b080808,
|
||||
0x0808082b, 0x2b080808, 0x08081919, 0x2b080808, 0x08082b08, 0x2b080808, 0x08190819, 0x2b080808,
|
||||
0x08191908, 0x2b080808, 0x08192b19, 0x2b080808, 0x082b0808, 0x2b080808, 0x082b1919, 0x2b080808,
|
||||
0x19080819, 0x2b080808, 0x19081908, 0x2b080808, 0x19190808, 0x2b080808, 0x1919082b, 0x2b080808,
|
||||
0x19191919, 0x2b080808, 0x19192b08, 0x2b080808, 0x192b0819, 0x2b080808, 0x2b080808, 0x2b080808,
|
||||
0x2b081919, 0x2b080808, 0x2b190819, 0x2b080808, 0x2b191908, 0x2b080808, 0x08080819, 0x2b080819,
|
||||
0x08081908, 0x2b080819, 0x08082b19, 0x2b080819, 0x08190808, 0x2b080819, 0x0819082b, 0x2b080819,
|
||||
0x08191919, 0x2b080819, 0x08192b08, 0x2b080819, 0x082b0819, 0x2b080819, 0x082b1908, 0x2b080819,
|
||||
0x19080808, 0x2b080819, 0x1908082b, 0x2b080819, 0x19081919, 0x2b080819, 0x19082b08, 0x2b080819,
|
||||
0x19190819, 0x2b080819, 0x19191908, 0x2b080819, 0x2b080819, 0x2b080819, 0x2b081908, 0x2b080819,
|
||||
0x2b190808, 0x2b080819, 0x2b2b2b19, 0x2b080819, 0x08080808, 0x2b08082b, 0x08081919, 0x2b08082b,
|
||||
0x08082b2b, 0x2b08082b, 0x08190819, 0x2b08082b, 0x08191908, 0x2b08082b, 0x19080819, 0x2b08082b,
|
||||
0x19081908, 0x2b08082b, 0x19190808, 0x2b08082b, 0x08080819, 0x2b081908, 0x08081908, 0x2b081908,
|
||||
0x0808192b, 0x2b081908, 0x08082b19, 0x2b081908, 0x08190808, 0x2b081908, 0x0819082b, 0x2b081908,
|
||||
0x08191919, 0x2b081908, 0x08192b08, 0x2b081908, 0x082b0819, 0x2b081908, 0x19080808, 0x2b081908,
|
||||
0x1908082b, 0x2b081908, 0x19081919, 0x2b081908, 0x19082b08, 0x2b081908, 0x19190819, 0x2b081908,
|
||||
0x19191908, 0x2b081908, 0x192b0808, 0x2b081908, 0x2b080819, 0x2b081908, 0x2b081908, 0x2b081908,
|
||||
0x2b190808, 0x2b081908, 0x08080808, 0x2b081919, 0x0808082b, 0x2b081919, 0x08081919, 0x2b081919,
|
||||
0x08082b08, 0x2b081919, 0x08190819, 0x2b081919, 0x08191908, 0x2b081919, 0x082b0808, 0x2b081919,
|
||||
0x19080819, 0x2b081919, 0x19081908, 0x2b081919, 0x19190808, 0x2b081919, 0x2b080808, 0x2b081919,
|
||||
0x2b082b2b, 0x2b081919, 0x08080819, 0x2b08192b, 0x08081908, 0x2b08192b, 0x08190808, 0x2b08192b,
|
||||
0x082b2b19, 0x2b08192b, 0x19080808, 0x2b08192b, 0x08080808, 0x2b082b08, 0x08081919, 0x2b082b08,
|
||||
0x08190819, 0x2b082b08, 0x08191908, 0x2b082b08, 0x19080819, 0x2b082b08, 0x19081908, 0x2b082b08,
|
||||
0x19190808, 0x2b082b08, 0x2b2b082b, 0x2b082b08, 0x08080819, 0x2b082b19, 0x08081908, 0x2b082b19,
|
||||
0x19080808, 0x2b082b19, 0x192b1919, 0x2b082b19, 0x082b082b, 0x2b082b2b, 0x19192b08, 0x2b082b2b,
|
||||
0x19192b2b, 0x2b082b2b, 0x2b08082b, 0x2b082b2b, 0x2b2b082b, 0x2b082b2b, 0x08080819, 0x2b190808,
|
||||
0x08081908, 0x2b190808, 0x08082b19, 0x2b190808, 0x08190808, 0x2b190808, 0x0819082b, 0x2b190808,
|
||||
0x08191919, 0x2b190808, 0x08192b08, 0x2b190808, 0x082b1908, 0x2b190808, 0x19080808, 0x2b190808,
|
||||
0x1908082b, 0x2b190808, 0x19081919, 0x2b190808, 0x19082b08, 0x2b190808, 0x19190819, 0x2b190808,
|
||||
0x19191908, 0x2b190808, 0x192b0808, 0x2b190808, 0x2b080819, 0x2b190808, 0x2b081908, 0x2b190808,
|
||||
0x2b190808, 0x2b190808, 0x08080808, 0x2b190819, 0x08081919, 0x2b190819, 0x08190819, 0x2b190819,
|
||||
0x08191908, 0x2b190819, 0x19080819, 0x2b190819, 0x19081908, 0x2b190819, 0x19190808, 0x2b190819,
|
||||
0x19192b2b, 0x2b190819, 0x08080819, 0x2b19082b, 0x08081908, 0x2b19082b, 0x08190808, 0x2b19082b,
|
||||
0x19080808, 0x2b19082b, 0x2b2b192b, 0x2b19082b, 0x08080808, 0x2b191908, 0x0808082b, 0x2b191908,
|
||||
0x08081919, 0x2b191908, 0x08082b08, 0x2b191908, 0x08190819, 0x2b191908, 0x08191908, 0x2b191908,
|
||||
0x082b0808, 0x2b191908, 0x19080819, 0x2b191908, 0x19081908, 0x2b191908, 0x19190808, 0x2b191908,
|
||||
0x2b080808, 0x2b191908, 0x2b19192b, 0x2b191908, 0x08080819, 0x2b191919, 0x08081908, 0x2b191919,
|
||||
0x08190808, 0x2b191919, 0x19080808, 0x2b191919, 0x2b192b08, 0x2b191919, 0x2b2b0819, 0x2b191919,
|
||||
0x08080808, 0x2b19192b, 0x1908192b, 0x2b19192b, 0x192b1908, 0x2b19192b, 0x08080819, 0x2b192b08,
|
||||
0x08081908, 0x2b192b08, 0x08190808, 0x2b192b08, 0x082b192b, 0x2b192b08, 0x19080808, 0x2b192b08,
|
||||
0x2b2b2b19, 0x2b192b08, 0x08080808, 0x2b192b19, 0x19082b19, 0x2b192b19, 0x1919082b, 0x2b192b19,
|
||||
0x2b190808, 0x2b192b2b, 0x08080808, 0x2b2b0808, 0x08081919, 0x2b2b0808, 0x08082b2b, 0x2b2b0808,
|
||||
0x08191908, 0x2b2b0808, 0x082b082b, 0x2b2b0808, 0x082b2b2b, 0x2b2b0808, 0x19080819, 0x2b2b0808,
|
||||
0x19081908, 0x2b2b0808, 0x19190808, 0x2b2b0808, 0x2b2b082b, 0x2b2b0808, 0x2b2b2b2b, 0x2b2b0808,
|
||||
0x19080808, 0x2b2b0819, 0x192b1919, 0x2b2b0819, 0x0808082b, 0x2b2b082b, 0x08082b2b, 0x2b2b082b,
|
||||
0x082b082b, 0x2b2b082b, 0x082b2b08, 0x2b2b082b, 0x082b2b2b, 0x2b2b082b, 0x2b08082b, 0x2b2b082b,
|
||||
0x2b082b08, 0x2b2b082b, 0x2b082b2b, 0x2b2b082b, 0x2b2b2b08, 0x2b2b082b, 0x08080819, 0x2b2b1908,
|
||||
0x08081908, 0x2b2b1908, 0x08190808, 0x2b2b1908, 0x19080808, 0x2b2b1908, 0x2b082b19, 0x2b2b1908,
|
||||
0x2b2b1908, 0x2b2b1908, 0x08080808, 0x2b2b1919, 0x08192b19, 0x2b2b1919, 0x19190819, 0x2b2b192b,
|
||||
0x08082b2b, 0x2b2b2b08, 0x082b2b08, 0x2b2b2b08, 0x2b2b082b, 0x2b2b2b08, 0x19191908, 0x2b2b2b19,
|
||||
0x2b08192b, 0x2b2b2b19, 0x08082b08, 0x2b2b2b2b, 0x08082b2b, 0x2b2b2b2b, 0x082b0808, 0x2b2b2b2b,
|
||||
0x082b082b, 0x2b2b2b2b, 0x082b2b08, 0x2b2b2b2b, 0x2b082b08, 0x2b2b2b2b, 0x2b2b2b2b, 0x2b2b2b2b
|
||||
);
|
||||
#enddecl(IQ2_S_GRID)
|
||||
|
||||
#decl(IQ3_XSS_GRID)
|
||||
|
||||
const iq3xxs_grid = array<u32, 256>(
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
||||
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
|
||||
0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
||||
0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
||||
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
|
||||
0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
||||
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
||||
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
|
||||
0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
||||
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
||||
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
||||
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
|
||||
0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
||||
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
||||
0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
|
||||
0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
||||
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
||||
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
|
||||
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
||||
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
|
||||
0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
|
||||
0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
||||
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
|
||||
0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
|
||||
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
||||
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
||||
0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
||||
0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
||||
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04
|
||||
);
|
||||
#enddecl(IQ3_XSS_GRID)
|
||||
|
||||
#decl(IQ3_S_GRID)
|
||||
|
||||
const iq3s_grid = array<u32, 512>(
|
||||
0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
|
||||
0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
|
||||
0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
|
||||
0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
|
||||
0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
|
||||
0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
|
||||
0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
|
||||
0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
|
||||
0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
|
||||
0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
|
||||
0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
|
||||
0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
|
||||
0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
|
||||
0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
|
||||
0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
|
||||
0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
|
||||
0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
|
||||
0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
|
||||
0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
|
||||
0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
|
||||
0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
|
||||
0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
|
||||
0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
|
||||
0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
|
||||
0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
|
||||
0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
|
||||
0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
|
||||
0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
|
||||
0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
|
||||
0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
|
||||
0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
|
||||
0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
|
||||
0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
|
||||
0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
|
||||
0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
|
||||
0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
|
||||
0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
|
||||
0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
|
||||
0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
|
||||
0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
|
||||
0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
|
||||
0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
|
||||
0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
|
||||
0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
|
||||
0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
|
||||
0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
|
||||
0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
|
||||
0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
|
||||
0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
|
||||
0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
|
||||
0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
|
||||
0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
|
||||
0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
|
||||
0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
|
||||
0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
|
||||
0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
|
||||
0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
|
||||
0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
|
||||
0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
|
||||
0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
|
||||
0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
|
||||
0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
|
||||
0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
|
||||
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101
|
||||
);
|
||||
#enddecl(IQ3_S_GRID)
|
||||
|
||||
#decl(IQ1_GRID)
|
||||
|
||||
const IQ1_DELTA: f32 = 0.125;
|
||||
|
||||
const iq1_grid = array<u32, 1024>(
|
||||
0xfffdffff, 0xfff7fff0, 0xffccfff5, 0xffdfffc0, 0xffd7ffdd, 0xff30ffd5, 0xff03ff0c, 0xff10ff01,
|
||||
0xff7dff7f, 0xff75ff77, 0xff5fff40, 0xff57ff5d, 0xfcf3ff55, 0xfcccfcf0, 0xfcc1fcc3, 0xfcc5fcc4,
|
||||
0xfc3cfcd0, 0xfc34fc31, 0xfc00fc0d, 0xfc1cfc05, 0xfc11fc13, 0xfc70fc17, 0xfc43fc4c, 0xfc50fc41,
|
||||
0xfdfdfdff, 0xfdf5fdf7, 0xfddffdc0, 0xfdd7fddd, 0xfd30fdd5, 0xfd04fd0c, 0xfd14fd13, 0xfd7dfd7f,
|
||||
0xfd75fd77, 0xfd40fd4c, 0xfd5ffd44, 0xfd57fd5d, 0xf3ccfd55, 0xf3c1f3c3, 0xf33cf3d0, 0xf300f334,
|
||||
0xf313f305, 0xf34cf310, 0xf350f344, 0xf0f3f0fc, 0xf0f1f0f0, 0xf0c7f0c0, 0xf0d4f0c5, 0xf030f03f,
|
||||
0xf00ff035, 0xf003f00c, 0xf001f000, 0xf01ff004, 0xf010f01d, 0xf015f017, 0xf04cf07c, 0xf047f040,
|
||||
0xf05cf045, 0xf050f053, 0xf054f051, 0xf1c4f1c3, 0xf133f13c, 0xf10df10f, 0xf107f100, 0xf11cf11f,
|
||||
0xf114f111, 0xf14cf170, 0xf144f143, 0xf7fdf7ff, 0xf7f5f7f7, 0xf7dff7c0, 0xf7d7f7dd, 0xf730f7d5,
|
||||
0xf701f70c, 0xf77ff710, 0xf777f77d, 0xf740f775, 0xf75df75f, 0xf755f757, 0xf4ccf4f0, 0xf4c4f4c3,
|
||||
0xf4d0f4d3, 0xf40ff43c, 0xf400f40c, 0xf413f41c, 0xf44cf414, 0xf441f443, 0xf450f444, 0xf5fdf5ff,
|
||||
0xf5f5f5f7, 0xf5dff5c0, 0xf5d7f5dd, 0xf530f5d5, 0xf504f50c, 0xf510f51c, 0xf57df57f, 0xf577f570,
|
||||
0xf540f575, 0xf55df55f, 0xf555f557, 0xcfcccfcf, 0xcfc4cfc3, 0xcfd0cfd3, 0xcf33cf3c, 0xcf00cf0f,
|
||||
0xcf1ccf07, 0xcf10cf13, 0xcf4ccf14, 0xcf41cf43, 0xcf50cf5c, 0xccf3ccfc, 0xccf4ccf1, 0xcccdcccf,
|
||||
0xccc7ccc0, 0xccd3ccdc, 0xcc30ccd4, 0xcc0fcc35, 0xcc0dcc0c, 0xcc00cc03, 0xcc04cc01, 0xcc10cc1f,
|
||||
0xcc4dcc73, 0xcc5ccc40, 0xcdcccc53, 0xcdc1cdc3, 0xcd3fcdd0, 0xcd34cd31, 0xcd00cd0d, 0xcd05cd07,
|
||||
0xcd11cd13, 0xcd4ccd70, 0xcd41cd43, 0xc3fccd50, 0xc3f4c3f1, 0xc3c0c3c3, 0xc3c4c3c7, 0xc3d1c3dc,
|
||||
0xc330c33c, 0xc337c331, 0xc30cc335, 0xc300c303, 0xc304c301, 0xc310c31d, 0xc373c317, 0xc34fc374,
|
||||
0xc340c343, 0xc344c347, 0xc35cc345, 0xc350c353, 0xc0fdc354, 0xc0f5c0f0, 0xc0c3c0cc, 0xc0c1c0c0,
|
||||
0xc0dfc0c4, 0xc0d0c0dd, 0xc0d5c0d7, 0xc033c03c, 0xc031c030, 0xc00dc00c, 0xc000c003, 0xc004c001,
|
||||
0xc01cc005, 0xc010c013, 0xc014c011, 0xc07dc07f, 0xc070c073, 0xc075c077, 0xc04cc04f, 0xc040c043,
|
||||
0xc044c041, 0xc05fc045, 0xc050c05d, 0xc1f3c1fc, 0xc1f1c1f0, 0xc1c1c1c0, 0xc1c5c1c7, 0xc1d1c1dc,
|
||||
0xc13dc13f, 0xc130c133, 0xc135c137, 0xc100c10c, 0xc107c101, 0xc11cc104, 0xc110c113, 0xc114c117,
|
||||
0xc171c115, 0xc14dc175, 0xc153c140, 0xc7ccc154, 0xc7d0c7c1, 0xc733c73c, 0xc734c731, 0xc700c70f,
|
||||
0xc705c707, 0xc71cc71f, 0xc711c713, 0xc770c714, 0xc743c74c, 0xc4cfc750, 0xc4c0c4cd, 0xc4dcc4c5,
|
||||
0xc43dc4d0, 0xc430c433, 0xc40cc437, 0xc400c403, 0xc404c401, 0xc41fc405, 0xc415c410, 0xc44cc474,
|
||||
0xc440c44d, 0xc45cc447, 0xc454c451, 0xc5c1c5f4, 0xc5d1c5d3, 0xc531c533, 0xc50fc534, 0xc500c50d,
|
||||
0xc51cc507, 0xc514c511, 0xc54cc570, 0xc545c541, 0xdffddfff, 0xdff5dff7, 0xdfdfdfc0, 0xdfd0dfdd,
|
||||
0xdfd5dfd7, 0xdf0cdf30, 0xdf1cdf04, 0xdf7fdf10, 0xdf77df7d, 0xdf40df75, 0xdf5ddf5f, 0xdf57df50,
|
||||
0xdcf0df55, 0xdcc3dccc, 0xdcd0dcc4, 0xdc33dc3d, 0xdc00dc34, 0xdc05dc07, 0xdc13dc1c, 0xdc11dc10,
|
||||
0xdc4fdc70, 0xdc44dc41, 0xddfcdc50, 0xddf5ddf7, 0xddc0ddcc, 0xdddddddf, 0xddd5ddd7, 0xdd0cdd30,
|
||||
0xdd04dd01, 0xdd7cdd10, 0xdd75dd77, 0xdd40dd4c, 0xdd5ddd5f, 0xdd55dd57, 0xd3c3d3f0, 0xd3c4d3c1,
|
||||
0xd333d3d0, 0xd331d330, 0xd30dd334, 0xd307d300, 0xd311d305, 0xd34cd370, 0xd344d343, 0xd350d35c,
|
||||
0xd0c0d0f4, 0xd0d4d0dc, 0xd030d03f, 0xd00cd037, 0xd000d003, 0xd01dd004, 0xd017d010, 0xd04fd074,
|
||||
0xd040d043, 0xd045d047, 0xd053d05c, 0xd054d051, 0xd1cfd1f0, 0xd1c4d1cd, 0xd13cd1d0, 0xd100d134,
|
||||
0xd11cd11f, 0xd173d114, 0xd14fd171, 0xd7ffd145, 0xd7f7d7fd, 0xd7c0d7f5, 0xd7ddd7df, 0xd7d5d7d7,
|
||||
0xd70cd730, 0xd710d703, 0xd77dd77f, 0xd775d777, 0xd75dd75f, 0xd755d757, 0xd4ccd4f4, 0xd4c4d4c3,
|
||||
0xd431d4d0, 0xd40dd434, 0xd41cd400, 0xd411d413, 0xd470d414, 0xd441d44f, 0xd453d444, 0xd5ffd450,
|
||||
0xd5f7d5fd, 0xd5dfd5f5, 0xd5d7d5dd, 0xd530d5d5, 0xd501d50c, 0xd510d504, 0xd57dd57f, 0xd575d577,
|
||||
0xd55fd540, 0xd557d55d, 0x3ff0d555, 0x3fc13fcc, 0x3f343fd0, 0x3f003f0d, 0x3f053f07, 0x3f133f1c,
|
||||
0x3f433f11, 0x3f5c3f44, 0x3cff3f51, 0x3cf33cfc, 0x3cf43cf1, 0x3cc03ccd, 0x3cc73cc1, 0x3cdc3cc5,
|
||||
0x3cd43cd1, 0x3c373c30, 0x3c0c3c35, 0x3c003c03, 0x3c043c01, 0x3c103c05, 0x3c153c17, 0x3c733c7c,
|
||||
0x3c4f3c71, 0x3c403c4d, 0x3c5c3c5f, 0x3df03c5d, 0x3dc33dcc, 0x3dd03dc1, 0x3d0d3d3c, 0x3d053d00,
|
||||
0x3d143d13, 0x3d433d74, 0x33fc3d50, 0x33c433c0, 0x333033d4, 0x33353337, 0x3303330c, 0x33013300,
|
||||
0x331d331c, 0x33173310, 0x337c3315, 0x33743371, 0x334d334f, 0x335f3340, 0x3354335c, 0x30fd30fc,
|
||||
0x30f530f0, 0x30c330cc, 0x30c130c0, 0x30df30c4, 0x30d530d0, 0x3033303c, 0x30313030, 0x300f3034,
|
||||
0x3003300c, 0x30013000, 0x30043007, 0x3013301c, 0x30113010, 0x307d3014, 0x30703073, 0x304c3077,
|
||||
0x30403043, 0x30443041, 0x30503045, 0x30553057, 0x31f031fc, 0x31c331f4, 0x31c731c0, 0x31dc31c5,
|
||||
0x31d431d3, 0x313d313f, 0x31373130, 0x310c310f, 0x3100310d, 0x31043101, 0x3110311d, 0x317c3117,
|
||||
0x31753170, 0x31403143, 0x3153315c, 0x37f03151, 0x37c037cc, 0x37d037c5, 0x3734373d, 0x3700370f,
|
||||
0x371c3707, 0x37113713, 0x37703714, 0x3743374c, 0x37443741, 0x34fc3750, 0x34f134f0, 0x34cf34f5,
|
||||
0x34c034c3, 0x34dc34c7, 0x34d134d3, 0x3430343f, 0x340c3435, 0x3403340d, 0x34013400, 0x341f3404,
|
||||
0x3410341d, 0x34153411, 0x34743471, 0x3440344d, 0x34473441, 0x3453345c, 0x34543451, 0x353335c1,
|
||||
0x35343531, 0x35073500, 0x35133505, 0x35433514, 0x0ffc3550, 0x0ff00ff3, 0x0ff40ff1, 0x0fc00fcd,
|
||||
0x0fdc0fc5, 0x0fd40fd3, 0x0f300f3f, 0x0f0c0f37, 0x0f000f03, 0x0f040f01, 0x0f170f10, 0x0f740f71,
|
||||
0x0f470f40, 0x0f5c0f5f, 0x0f540f51, 0x0cf70cf0, 0x0cf50cf4, 0x0cc30ccc, 0x0cc10cc0, 0x0cc40cc7,
|
||||
0x0cd00cdf, 0x0cd70cd1, 0x0c3c0cd5, 0x0c300c33, 0x0c340c31, 0x0c0c0c0f, 0x0c030c0d, 0x0c010c00,
|
||||
0x0c040c07, 0x0c1c0c05, 0x0c100c13, 0x0c140c11, 0x0c700c7d, 0x0c430c4c, 0x0c410c40, 0x0c5f0c44,
|
||||
0x0c550c50, 0x0df10dfc, 0x0dc00dcd, 0x0ddc0dc5, 0x0d3d0dd3, 0x0d350d30, 0x0d030d0c, 0x0d010d00,
|
||||
0x0d1d0d04, 0x0d700d10, 0x0d4d0d4f, 0x0d440d40, 0x0d530d45, 0x03f003f3, 0x03c303cc, 0x03c103c0,
|
||||
0x03c403c7, 0x03d003dc, 0x03d503d7, 0x0333033c, 0x03310330, 0x03350334, 0x030c030f, 0x03000303,
|
||||
0x03070301, 0x03050304, 0x031d031c, 0x03100313, 0x03140311, 0x0377037f, 0x034c0375, 0x03400343,
|
||||
0x03440341, 0x0353035c, 0x03550350, 0x00fd00fc, 0x00f000f3, 0x00f400f1, 0x00cc00cf, 0x00c300cd,
|
||||
0x00c100c0, 0x00c500c4, 0x00d300dc, 0x00d100d0, 0x003f00d4, 0x003d003c, 0x00300033, 0x00370031,
|
||||
0x000f0034, 0x000d000c, 0x00000003, 0x00070001, 0x00050004, 0x001c001f, 0x00100013, 0x00170011,
|
||||
0x00150014, 0x0073007c, 0x00740070, 0x004f0075, 0x0043004c, 0x00410040, 0x00440047, 0x0053005c,
|
||||
0x00510050, 0x01ff0054, 0x01fd01fc, 0x01f101f3, 0x01f401f7, 0x01c301cc, 0x01c701c0, 0x01df01c4,
|
||||
0x01dd01dc, 0x01d001d3, 0x01d701d1, 0x013c01d4, 0x01310130, 0x01340137, 0x010f0135, 0x010d010c,
|
||||
0x01000103, 0x01070101, 0x01050104, 0x0113011c, 0x01140110, 0x0170017d, 0x01770171, 0x01750174,
|
||||
0x0140014c, 0x015d0145, 0x01510150, 0x01540157, 0x07f007f3, 0x07f407f1, 0x07c007cf, 0x07dc07c7,
|
||||
0x073007d5, 0x07350737, 0x0703070c, 0x07010700, 0x07040707, 0x071d071f, 0x07100713, 0x0774077d,
|
||||
0x074d074f, 0x07470740, 0x0754075c, 0x04fd04fc, 0x04f504f0, 0x04c304cc, 0x04c104c0, 0x04d004c4,
|
||||
0x0433043c, 0x04310430, 0x040f0434, 0x040d040c, 0x04000403, 0x04070401, 0x04050404, 0x0413041c,
|
||||
0x04110410, 0x047c0414, 0x04740470, 0x0443044c, 0x04410440, 0x04440447, 0x05f30450, 0x05c005f7,
|
||||
0x05df05c5, 0x05d105d0, 0x053005d4, 0x05340537, 0x0500050c, 0x05070501, 0x051d0504, 0x05170510,
|
||||
0x057c0515, 0x054d0575, 0x05410540, 0x05450547, 0x1ff0055c, 0x1fc11fc3, 0x1fd01fc4, 0x1f0f1f33,
|
||||
0x1f011f00, 0x1f051f07, 0x1f131f1c, 0x1f141f11, 0x1f411f7c, 0x1cfc1f50, 0x1cf11cf3, 0x1ccd1cf4,
|
||||
0x1cdc1cc0, 0x1cd11cdd, 0x1c301cd4, 0x1c0c1c34, 0x1c011c00, 0x1c101c04, 0x1c151c11, 0x1c751c73,
|
||||
0x1c401c4d, 0x1c511c5c, 0x1dcc1c54, 0x1dc41dc1, 0x1d3c1d3f, 0x1d001d31, 0x1d071d01, 0x1d701d1f,
|
||||
0x1d411d4c, 0x13cc1d50, 0x13c013cd, 0x13c513c1, 0x13d113dc, 0x133f13d4, 0x1330133d, 0x13351337,
|
||||
0x1303130c, 0x13011300, 0x13051304, 0x131d131f, 0x13731310, 0x13741370, 0x134d134f, 0x13401343,
|
||||
0x13471341, 0x135c1345, 0x13541353, 0x10f710f0, 0x10cc10f5, 0x10c110c0, 0x103310c4, 0x10311030,
|
||||
0x100f1034, 0x1003100c, 0x10011000, 0x101c1004, 0x10101013, 0x10141011, 0x10741071, 0x104c1075,
|
||||
0x10411040, 0x10451044, 0x1050105d, 0x10571051, 0x11f411fd, 0x11df11c0, 0x11d711d1, 0x113f11d4,
|
||||
0x11371130, 0x110c1135, 0x11001103, 0x11071101, 0x111f1105, 0x11171110, 0x117d117f, 0x11751170,
|
||||
0x11411143, 0x11441147, 0x1153115f, 0x11551151, 0x17c417c1, 0x173c17d0, 0x1700170d, 0x171c1705,
|
||||
0x17701714, 0x1747174c, 0x14fc1751, 0x14cf14f3, 0x14dc14c0, 0x14d114d3, 0x143f14d4, 0x1430143c,
|
||||
0x14371431, 0x1403140c, 0x14011400, 0x141f1404, 0x14151410, 0x1473147d, 0x14401475, 0x1453145c,
|
||||
0x14541450, 0x15c115cc, 0x153c15c7, 0x15341533, 0x1500150f, 0x15051507, 0x15101513, 0x15711514,
|
||||
0x15471543, 0x15511545, 0x7ffd7fff, 0x7ff57ff7, 0x7fdd7fdf, 0x7fd57fd7, 0x7f0f7f30, 0x7f037f0c,
|
||||
0x7f047f01, 0x7f7f7f10, 0x7f777f7d, 0x7f407f75, 0x7f5d7f5f, 0x7f557f57, 0x7ccc7cf0, 0x7cc17cc3,
|
||||
0x7cd07cc4, 0x7c337c3c, 0x7c0f7c34, 0x7c007c0d, 0x7c077c01, 0x7c137c04, 0x7c147c11, 0x7c747c70,
|
||||
0x7c417c43, 0x7c507c44, 0x7dfd7dff, 0x7df57df7, 0x7ddf7dc0, 0x7dd77ddd, 0x7d0c7dd5, 0x7d047d03,
|
||||
0x7d7f7d10, 0x7d777d7d, 0x7d407d75, 0x7d5d7d5f, 0x7d557d57, 0x73c473c3, 0x7333733c, 0x7300730c,
|
||||
0x731c7305, 0x73147313, 0x73447343, 0x70f470fc, 0x70c070cd, 0x70d170c5, 0x703f70d4, 0x7030703c,
|
||||
0x700c7037, 0x70007003, 0x70047001, 0x70107005, 0x70177011, 0x707c7015, 0x70717073, 0x704f7074,
|
||||
0x7040704d, 0x70517047, 0x71c171cc, 0x71d071c4, 0x7133713c, 0x71357134, 0x7100710f, 0x71057104,
|
||||
0x7111711c, 0x71707115, 0x7145714c, 0x77ff7153, 0x77f777fd, 0x77c077f5, 0x77dd77df, 0x77d577d7,
|
||||
0x7730773c, 0x7703770c, 0x77107704, 0x777f7714, 0x7777777d, 0x77407775, 0x775d775f, 0x77557757,
|
||||
0x74f174f0, 0x74c374cc, 0x74d074c1, 0x7433743c, 0x74347431, 0x740d740f, 0x74057400, 0x7413741c,
|
||||
0x74417470, 0x74507444, 0x75fd75ff, 0x75f575f7, 0x75df75c0, 0x75d775dd, 0x753075d5, 0x7503750c,
|
||||
0x757f7501, 0x7577757d, 0x75407575, 0x755d755f, 0x75557557, 0x4fcc4ff0, 0x4fc74fc1, 0x4fd04fc4,
|
||||
0x4f314f3c, 0x4f004f34, 0x4f054f07, 0x4f154f14, 0x4f4c4f70, 0x4f414f43, 0x4f504f44, 0x4cf34cfc,
|
||||
0x4cf44cf1, 0x4cc04ccf, 0x4cc54cc7, 0x4cd34cdc, 0x4cd44cd1, 0x4c304c3f, 0x4c0c4c0f, 0x4c004c03,
|
||||
0x4c044c01, 0x4c104c1d, 0x4c714c73, 0x4c404c4d, 0x4c5c4c47, 0x4c514c53, 0x4df04c54, 0x4dc34dcc,
|
||||
0x4dd04dc4, 0x4d314d33, 0x4d0f4d34, 0x4d004d0d, 0x4d114d07, 0x4d704d14, 0x4d414d43, 0x43fc4d54,
|
||||
0x43f143f3, 0x43c043cf, 0x43d143c7, 0x4335433f, 0x4303430c, 0x43014300, 0x43044307, 0x431c431f,
|
||||
0x4310431d, 0x43714373, 0x4343434d, 0x43474340, 0x4354435c, 0x40f040ff, 0x40f540f7, 0x40cc40cf,
|
||||
0x40c040c3, 0x40c440c1, 0x40d040dc, 0x40d540d4, 0x4033403c, 0x40314030, 0x400f4034, 0x400d400c,
|
||||
0x40004003, 0x40074001, 0x40054004, 0x4013401c, 0x40114010, 0x407c4014, 0x40774070, 0x404d404c,
|
||||
0x40404043, 0x40444041, 0x405f4045, 0x4050405d, 0x40554057, 0x41f341fc, 0x41c041cf, 0x41df41c4,
|
||||
0x41d441d1, 0x41374130, 0x410c4134, 0x4100410d, 0x41044101, 0x41174110, 0x4173417d, 0x41754174,
|
||||
0x4143414d, 0x41534140, 0x41544151, 0x47c147f0, 0x47d047c4, 0x4731473c, 0x470d470f, 0x47014700,
|
||||
0x47134705, 0x47704710, 0x4741474c, 0x47504744, 0x44f144f3, 0x44cf44f4, 0x44c044cd, 0x44c544c7,
|
||||
0x44dc44df, 0x44d144d3, 0x443d443f, 0x44374430, 0x440c4435, 0x44004403, 0x44044401, 0x4410441d,
|
||||
0x44154411, 0x4473447c, 0x444d444f, 0x44454440, 0x4451445c, 0x45c045f0, 0x453345d0, 0x45344531,
|
||||
0x4500450f, 0x451c4507, 0x454c4570, 0x45404543, 0x5fff4541, 0x5ff75ffd, 0x5fc05ff5, 0x5fdd5fdf,
|
||||
0x5fd55fd7, 0x5f0c5f30, 0x5f015f03, 0x5f7f5f04, 0x5f775f7d, 0x5f405f75, 0x5f5d5f5f, 0x5f555f57,
|
||||
0x5cf45cf0, 0x5cc35ccc, 0x5cc45cc1, 0x5c315cc5, 0x5c0c5c34, 0x5c075c00, 0x5c1c5c05, 0x5c705c13,
|
||||
0x5c4d5c4f, 0x5c445c41, 0x5df75dfd, 0x5dcf5df5, 0x5ddd5dc4, 0x5dd55dd7, 0x5d0c5d30, 0x5d045d01,
|
||||
0x5d7f5d10, 0x5d775d7d, 0x5d405d75, 0x5d5d5d5f, 0x5d555d57, 0x53d053c4, 0x5333533c, 0x5303530f,
|
||||
0x53075300, 0x531c5305, 0x53115310, 0x53145317, 0x50f15370, 0x50cf50f4, 0x50c050cd, 0x50d150c7,
|
||||
0x503d50d4, 0x500c5030, 0x50005003, 0x50045001, 0x50155010, 0x5073507c, 0x50715070, 0x504d5074,
|
||||
0x50475040, 0x51cc51f0, 0x51c551c1, 0x51d051dc, 0x51315133, 0x510d5135, 0x51015100, 0x511f5107,
|
||||
0x5171511d, 0x5140514f, 0x51445141, 0x5153515c, 0x57ff5151, 0x57f757fd, 0x57df57f5, 0x57d757dd,
|
||||
0x570c57d5, 0x57015703, 0x577f5704, 0x5777577d, 0x57405775, 0x575d575f, 0x57555757, 0x54c354f0,
|
||||
0x54dc54c4, 0x543c54d0, 0x5400540f, 0x541c5405, 0x54145411, 0x5441544f, 0x55fd55ff, 0x55f555f7,
|
||||
0x55dd55df, 0x55d555d7, 0x5503550c, 0x557f5501, 0x5577557d, 0x55405575, 0x555d555f, 0x55555557
|
||||
);
|
||||
|
||||
#enddecl(IQ1_GRID)
|
||||
|
||||
#decl(IQ4_GRID)
|
||||
|
||||
const kvalues_iq4nl = array<i32, 16>(
|
||||
-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113
|
||||
);
|
||||
|
||||
#enddecl(IQ4_GRID)
|
||||
@@ -27,6 +27,26 @@ def replace_placeholders(shader_text, replacements):
|
||||
return shader_text
|
||||
|
||||
|
||||
def expand_includes(shader, input_dir):
|
||||
"""
|
||||
Replace #include "file" lines in the text with the contents of that file.
|
||||
Searches for files relative to input_dir.
|
||||
"""
|
||||
include_pattern = re.compile(r'^\s*#include\s+"([^"]+)"\s*$', re.MULTILINE)
|
||||
|
||||
def replacer(match):
|
||||
fname = match.group(1)
|
||||
file_path = os.path.join(input_dir, fname)
|
||||
if not os.path.exists(file_path):
|
||||
raise FileNotFoundError(f"Included file not found: {file_path}")
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
included_code = f.read()
|
||||
# Recursively expand includes inside the included file
|
||||
return expand_includes(included_code, input_dir)
|
||||
|
||||
return include_pattern.sub(replacer, shader)
|
||||
|
||||
|
||||
def write_shader(shader_name, shader_code, output_dir, outfile):
|
||||
if output_dir:
|
||||
wgsl_filename = os.path.join(output_dir, f"{shader_name}.wgsl")
|
||||
@@ -35,8 +55,9 @@ def write_shader(shader_name, shader_code, output_dir, outfile):
|
||||
outfile.write(f'const char* wgsl_{shader_name} = R"({shader_code})";\n\n')
|
||||
|
||||
|
||||
def generate_variants(shader_path, output_dir, outfile):
|
||||
shader_base_name = shader_path.split("/")[-1].split(".")[0]
|
||||
def generate_variants(fname, input_dir, output_dir, outfile):
|
||||
shader_path = os.path.join(input_dir, fname)
|
||||
shader_base_name = fname.split(".")[0]
|
||||
|
||||
with open(shader_path, "r", encoding="utf-8") as f:
|
||||
text = f.read()
|
||||
@@ -46,11 +67,21 @@ def generate_variants(shader_path, output_dir, outfile):
|
||||
except ValueError:
|
||||
write_shader(shader_base_name, text, output_dir, outfile)
|
||||
else:
|
||||
decls_map = parse_decls(extract_block(text, "DECLS"))
|
||||
shader_template = extract_block(text, "SHADER")
|
||||
try:
|
||||
decls_map = parse_decls(extract_block(text, "DECLS"))
|
||||
except ValueError:
|
||||
decls_map = {}
|
||||
|
||||
with open(os.path.join(input_dir, "common_decls.tmpl"), "r", encoding="utf-8") as f:
|
||||
common_decls = f.read()
|
||||
decls_map.update(parse_decls(common_decls))
|
||||
|
||||
shader_template = extract_block(text, "SHADER")
|
||||
for variant in variants:
|
||||
decls = variant["DECLS"]
|
||||
if "DECLS" in variant:
|
||||
decls = variant["DECLS"]
|
||||
else:
|
||||
decls = []
|
||||
decls_code = ""
|
||||
for key in decls:
|
||||
if key not in decls_map:
|
||||
@@ -59,8 +90,16 @@ def generate_variants(shader_path, output_dir, outfile):
|
||||
|
||||
shader_variant = replace_placeholders(shader_template, variant["REPLS"])
|
||||
final_shader = re.sub(r'\bDECLS\b', decls_code, shader_variant)
|
||||
final_shader = expand_includes(final_shader, input_dir)
|
||||
|
||||
output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC0_TYPE"], variant["REPLS"]["SRC1_TYPE"]])
|
||||
if "SRC0_TYPE" in variant["REPLS"] and "SRC1_TYPE" in variant["REPLS"]:
|
||||
output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC0_TYPE"], variant["REPLS"]["SRC1_TYPE"]])
|
||||
elif "TYPE_SUFFIX" in variant["REPLS"]:
|
||||
output_name = f"{shader_base_name}_" + variant["REPLS"]["TYPE_SUFFIX"]
|
||||
elif "TYPE" in variant["REPLS"]:
|
||||
output_name = f"{shader_base_name}_" + variant["REPLS"]["TYPE"]
|
||||
else:
|
||||
output_name = shader_base_name
|
||||
write_shader(output_name, final_shader, output_dir, outfile)
|
||||
|
||||
|
||||
@@ -78,7 +117,7 @@ def main():
|
||||
out.write("// Auto-generated shader embedding\n\n")
|
||||
for fname in sorted(os.listdir(args.input_dir)):
|
||||
if fname.endswith(".wgsl"):
|
||||
generate_variants(os.path.join(args.input_dir, fname), args.output_dir, out)
|
||||
generate_variants(fname, args.input_dir, args.output_dir, out)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -0,0 +1,874 @@
|
||||
#define(VARIANTS)
|
||||
|
||||
[
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "vec4<f32>",
|
||||
"TYPE_SUFFIX": "f32_vec",
|
||||
"DST_TYPE": "vec4<f32>",
|
||||
"BLOCK_SIZE": 4
|
||||
},
|
||||
"DECLS": ["F32_VEC"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "f32",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 1
|
||||
},
|
||||
"DECLS": ["F32"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "f16",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 1
|
||||
},
|
||||
"DECLS": ["F16"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "i32",
|
||||
"DST_TYPE": "i32",
|
||||
"BLOCK_SIZE": 1
|
||||
},
|
||||
"DECLS": ["I32"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "q4_0",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 32
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "Q4_0_T", "Q4_0"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "q4_1",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 32
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "Q4_1_T", "Q4_1"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "q5_0",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 32
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "Q5_0_T", "Q5_0"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "q5_1",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 32
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "Q5_1_T", "Q5_1"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "q8_0",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 32
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "Q8_0_T", "Q8_0"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "q2_k",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "Q2_K_T", "Q2_K"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "q3_k",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "Q3_K_T", "Q3_K"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "q4_k",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256
|
||||
},
|
||||
"DECLS": ["Q45_K_SCALE_MIN", "BYTE_HELPERS", "Q4_K_T", "Q4_K"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "q5_k",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256
|
||||
},
|
||||
"DECLS": ["Q45_K_SCALE_MIN", "BYTE_HELPERS", "Q5_K_T", "Q5_K"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "q6_k",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "Q6_K_T", "Q6_K"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "iq2_xxs",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_XXS_GRID", "IQ2_XXS_T", "IQ2_XXS"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "iq2_xs",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_XS_GRID", "IQ2_XS_T", "IQ2_XS"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE": "iq2_s",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_S_GRID", "IQ2_S_T", "IQ2_S"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE": "iq3_xxs",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ3_XSS_GRID", "IQ3_XSS_T", "IQ3_XSS"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE": "iq3_s",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ3_S_GRID", "IQ3_S_T", "IQ3_S"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE": "iq1_s",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "IQ1_GRID", "IQ1_S_T", "IQ1_S"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE": "iq1_m",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "IQ1_GRID", "IQ1_M_T", "IQ1_M"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE": "iq4_nl",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 32,
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "IQ4_GRID", "IQ4_NL_T", "IQ4_NL"]
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE": "iq4_xs",
|
||||
"DST_TYPE": "f32",
|
||||
"BLOCK_SIZE": 256,
|
||||
},
|
||||
"DECLS": ["BYTE_HELPERS", "IQ4_GRID", "IQ4_XS_T", "IQ4_XS"]
|
||||
}
|
||||
]
|
||||
|
||||
#end(VARIANTS)
|
||||
|
||||
#define(DECLS)
|
||||
|
||||
#decl(F32_VEC)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
dst[(dst_base / 4) + offset] = src[(src_base / 4) + offset];
|
||||
}
|
||||
#enddecl(F32_VEC)
|
||||
|
||||
#decl(F32)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
dst[dst_base + offset] = src[src_base + offset];
|
||||
}
|
||||
#enddecl(F32)
|
||||
|
||||
#decl(F16)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
dst[dst_base + offset] = f32(src[src_base + offset]);
|
||||
}
|
||||
#enddecl(F16)
|
||||
|
||||
#decl(I32)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
dst[dst_base + offset] = src[src_base + offset];
|
||||
}
|
||||
#enddecl(I32)
|
||||
|
||||
#decl(Q4_0)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block_q4_0 = src[src_base + offset];
|
||||
let d = f32(block_q4_0.d);
|
||||
for (var j: u32 = 0; j < 4; j++) {
|
||||
let q_packed = bitcast<u32>(vec2(block_q4_0.qs[2 * j], block_q4_0.qs[2 * j + 1]));
|
||||
for (var k: u32 = 0; k < 4; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0f) * d;
|
||||
let q_lo = (f32(q_byte & 0xF) - 8.0f) * d;
|
||||
let dst_offset = dst_base + offset * 32 + j * 4 + k;
|
||||
dst[dst_offset] = q_lo;
|
||||
dst[dst_offset + 16] = q_hi;
|
||||
}
|
||||
}
|
||||
}
|
||||
#enddecl(Q4_0)
|
||||
|
||||
#decl(Q4_1)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block_q4_1 = src[src_base + offset];
|
||||
let d = f32(block_q4_1.d);
|
||||
let m = f32(block_q4_1.m);
|
||||
for (var j: u32 = 0; j < 4; j++) {
|
||||
let q_packed = block_q4_1.qs[j];
|
||||
for (var k: u32 = 0; k < 4; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let q_hi = f32((q_byte >> 4) & 0xF) * d + m;
|
||||
let q_lo = f32(q_byte & 0xF) * d + m;
|
||||
let dst_offset = dst_base + offset * 32 + j * 4 + k;
|
||||
dst[dst_offset] = q_lo;
|
||||
dst[dst_offset + 16] = q_hi;
|
||||
}
|
||||
}
|
||||
}
|
||||
#enddecl(Q4_1)
|
||||
|
||||
#decl(Q5_0)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block_q5_0 = src[src_base + offset];
|
||||
let d = f32(block_q5_0.d);
|
||||
let qh_packed = bitcast<u32>(vec2(block_q5_0.qh[0], block_q5_0.qh[1]));
|
||||
for (var j: u32 = 0; j < 4; j++) {
|
||||
let q_packed = bitcast<u32>(vec2(block_q5_0.qs[2 * j], block_q5_0.qs[2 * j + 1]));
|
||||
for (var k: u32 = 0; k < 4; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let qh_hi = (qh_packed >> (j * 4 + k + 12)) & 0x10;
|
||||
let q_hi = (f32(((q_byte >> 4) & 0xF) | qh_hi) - 16.0) * d;
|
||||
let qh_lo = ((qh_packed >> (j * 4 + k)) << 4) & 0x10;
|
||||
let q_lo = (f32((q_byte & 0xF) | qh_lo) - 16.0) * d;
|
||||
let dst_offset = dst_base + offset * 32 + j * 4 + k;
|
||||
dst[dst_offset] = q_lo;
|
||||
dst[dst_offset + 16] = q_hi;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#enddecl(Q5_0)
|
||||
|
||||
#decl(Q5_1)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block_q5_1 = src[src_base + offset];
|
||||
let d = f32(block_q5_1.d);
|
||||
let m = f32(block_q5_1.m);
|
||||
for (var j: u32 = 0; j < 4; j++) {
|
||||
let q_packed = block_q5_1.qs[j];
|
||||
for (var k: u32 = 0; k < 4; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let qh_hi = (block_q5_1.qh >> (j * 4 + k + 12)) & 0x10;
|
||||
let q_hi = f32(((q_byte >> 4) & 0xF) | qh_hi) * d + m;
|
||||
let qh_lo = ((block_q5_1.qh >> (j * 4 + k)) << 4) & 0x10;
|
||||
let q_lo = f32((q_byte & 0xF) | qh_lo) * d + m;
|
||||
let dst_offset = dst_base + offset * 32 + j * 4 + k;
|
||||
dst[dst_offset] = q_lo;
|
||||
dst[dst_offset + 16] = q_hi;
|
||||
}
|
||||
}
|
||||
}
|
||||
#enddecl(Q5_1)
|
||||
|
||||
#decl(Q8_0)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block_q8_0 = src[src_base + offset];
|
||||
let d = f32(block_q8_0.d);
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let q_packed = bitcast<u32>(vec2(block_q8_0.qs[2 * j], block_q8_0.qs[2 * j + 1]));
|
||||
for (var k: u32 = 0; k < 4; k++) {
|
||||
let q_byte = get_byte_i32(q_packed, k);
|
||||
let q_val = f32(q_byte) * d;
|
||||
let dst_offset = dst_base + offset * 32 + j * 4 + k;
|
||||
dst[dst_offset] = q_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
#enddecl(Q8_0)
|
||||
|
||||
#decl(Q2_K)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
let m = f32(block.dmin);
|
||||
var dst_i = dst_base + offset * 256;
|
||||
var is: u32 = 0;
|
||||
// 2 halves of the block (128 elements each)
|
||||
for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) {
|
||||
// 4 groups (each group has 2 blocks of 16 elements)
|
||||
for (var shift: u32 = 0; shift < 8; shift += 2) {
|
||||
// 2 blocks
|
||||
for (var k: u32 = 0; k < 32; k += 16) {
|
||||
let sc = get_byte(block.scales[is / 4], is % 4);
|
||||
is++;
|
||||
let dl = d * f32(sc & 0xF);
|
||||
let ml = m * f32(sc >> 4);
|
||||
for (var l: u32 = 0u; l < 16; l++) {
|
||||
let q_idx = q_b_idx + k + l;
|
||||
let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4);
|
||||
let qs_val = (q_byte >> shift) & 3;
|
||||
dst[dst_i] = (f32(qs_val) * dl - ml);
|
||||
dst_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#enddecl(Q2_K)
|
||||
|
||||
#decl(Q3_K)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
|
||||
// extract 6-bit scales, which consist of 4-bits from first 8 bytes of scale,
|
||||
// and 2-bits from the last 4 bytes
|
||||
let kmask1: u32 = 0x03030303;
|
||||
let kmask2: u32 = 0x0f0f0f0f;
|
||||
var scale_vals: array<u32, 4>;
|
||||
for (var i: u32 = 0; i < 4; i++) {
|
||||
scale_vals[i] = bitcast<u32>(vec2(block.scales[2 * i], block.scales[2 * i + 1]));
|
||||
}
|
||||
var tmp: u32 = scale_vals[2];
|
||||
scale_vals[2] = ((scale_vals[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
||||
scale_vals[3] = ((scale_vals[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
||||
scale_vals[0] = (scale_vals[0] & kmask2) | ((tmp & kmask1) << 4);
|
||||
scale_vals[1] = (scale_vals[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
||||
|
||||
// convert arrays of f16 -> u32
|
||||
var hmask_vals: array<u32, 8>;
|
||||
for (var i: u32 = 0; i < 8; i++) {
|
||||
hmask_vals[i] = bitcast<u32>(vec2(block.hmask[2 * i], block.hmask[2 * i + 1]));
|
||||
}
|
||||
var qs_vals: array<u32, 16>;
|
||||
for (var i: u32 = 0; i < 16; i++) {
|
||||
qs_vals[i] = bitcast<u32>(vec2(block.qs[2 * i], block.qs[2 * i + 1]));
|
||||
}
|
||||
|
||||
var dst_i = dst_base + offset * 256;
|
||||
var is: u32 = 0;
|
||||
var m: u32 = 1;
|
||||
// 2 halves of the block (128 elements each)
|
||||
for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) {
|
||||
// 4 groups (each group has 2 blocks of 16 elements)
|
||||
for (var shift: u32 = 0; shift < 8; shift += 2) {
|
||||
// 2 blocks
|
||||
for (var k: u32 = 0; k < 32; k += 16) {
|
||||
let sc = get_byte(scale_vals[is / 4], is % 4);
|
||||
is++;
|
||||
let dl = d * (f32(sc) - 32.0);
|
||||
for (var l: u32 = 0u; l < 16u; l++) {
|
||||
let q_idx = q_b_idx + k + l;
|
||||
let hm_idx = k + l;
|
||||
let q_byte = get_byte(qs_vals[q_idx / 4], q_idx % 4);
|
||||
let hmask_byte = get_byte(hmask_vals[hm_idx / 4], hm_idx % 4);
|
||||
let hm = select(4.0, 0.0, (hmask_byte & m) != 0);
|
||||
let qs_val = (q_byte >> shift) & 3;
|
||||
dst[dst_i] = (f32(qs_val) - hm) * dl;
|
||||
dst_i++;
|
||||
}
|
||||
}
|
||||
m <<= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
#enddecl(Q3_K)
|
||||
|
||||
#decl(Q4_K)
|
||||
// 8 blocks of 32 elements each
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
let m = f32(block.dmin);
|
||||
var dst_i = dst_base + offset * 256;
|
||||
var is: u32 = 0;
|
||||
// 2 blocks each iteration
|
||||
for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) {
|
||||
for (var shift: u32 = 0; shift < 8; shift += 4) {
|
||||
let scale_min = get_scale_min(is, block.scales);
|
||||
is++;
|
||||
let dl = d * scale_min.x;
|
||||
let ml = m * scale_min.y;
|
||||
for (var l: u32 = 0; l < 32; l++) {
|
||||
let q_idx = q_b_idx + l;
|
||||
let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4);
|
||||
let qs_val = (q_byte >> shift) & 0xF;
|
||||
dst[dst_i] = (f32(qs_val) * dl - ml);
|
||||
dst_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#enddecl(Q4_K)
|
||||
|
||||
#decl(Q5_K)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
let m = f32(block.dmin);
|
||||
var dst_i = dst_base + offset * 256;
|
||||
var is: u32 = 0;
|
||||
var u: u32 = 1;
|
||||
// 2 blocks each iteration
|
||||
for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) {
|
||||
for (var shift: u32 = 0; shift < 8; shift += 4) {
|
||||
let scale_min = get_scale_min(is, block.scales);
|
||||
is++;
|
||||
let dl = d * scale_min.x;
|
||||
let ml = m * scale_min.y;
|
||||
for (var l: u32 = 0; l < 32; l++) {
|
||||
let q_idx = q_b_idx + l;
|
||||
let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4);
|
||||
let qh_byte = get_byte(block.qh[l / 4], l % 4);
|
||||
let qs_val = (q_byte >> shift) & 0xF;
|
||||
let qh_val = select(0.0, 16.0, (qh_byte & u) != 0);
|
||||
dst[dst_i] = (f32(qs_val) + qh_val) * dl - ml;
|
||||
dst_i++;
|
||||
}
|
||||
u <<= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
#enddecl(Q5_K)
|
||||
|
||||
#decl(Q6_K)
|
||||
// 16 blocks of 16 elements each
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
|
||||
// convert arrays of f16 -> u32
|
||||
var ql_vals: array<u32, 32>;
|
||||
for (var i: u32 = 0; i < 32; i++) {
|
||||
ql_vals[i] = bitcast<u32>(vec2(block.ql[2 * i], block.ql[2 * i + 1]));
|
||||
}
|
||||
var qh_vals: array<u32, 16>;
|
||||
for (var i: u32 = 0; i < 16; i++) {
|
||||
qh_vals[i] = bitcast<u32>(vec2(block.qh[2 * i], block.qh[2 * i + 1]));
|
||||
}
|
||||
var scale_vals: array<u32, 4>;
|
||||
for (var i: u32 = 0; i < 4; i++) {
|
||||
scale_vals[i] = bitcast<u32>(vec2(block.scales[2 * i], block.scales[2 * i + 1]));
|
||||
}
|
||||
|
||||
var dst_i = dst_base + offset * 256;
|
||||
var qh_b_idx: u32 = 0;
|
||||
var sc_b_idx: u32 = 0;
|
||||
for (var ql_b_idx: u32 = 0; ql_b_idx < 128; ql_b_idx += 64) {
|
||||
for (var l: u32 = 0; l < 32; l++) {
|
||||
let ql13_b = get_byte(ql_vals[(ql_b_idx + l) / 4], (ql_b_idx + l) % 4);
|
||||
let ql24_b = get_byte(ql_vals[(ql_b_idx + l + 32) / 4], (ql_b_idx + l + 32) % 4);
|
||||
let qh_b = get_byte(qh_vals[(qh_b_idx + l) / 4], (qh_b_idx + l) % 4);
|
||||
|
||||
let q1 = f32((ql13_b & 0xF) | ((qh_b & 3) << 4)) - 32.0;
|
||||
let q2 = f32((ql24_b & 0xF) | (((qh_b >> 2) & 3) << 4)) - 32.0;
|
||||
let q3 = f32((ql13_b >> 4) | (((qh_b >> 4) & 3) << 4)) - 32.0;
|
||||
let q4 = f32((ql24_b >> 4) | (((qh_b >> 6) & 3) << 4)) - 32.0;
|
||||
|
||||
let is = l/16;
|
||||
let is1 = sc_b_idx + is;
|
||||
let sc1 = get_byte_i32(scale_vals[is1 / 4], is1 % 4);
|
||||
let is2 = sc_b_idx + is + 2;
|
||||
let sc2 = get_byte_i32(scale_vals[is2 / 4], is2 % 4);
|
||||
let is3 = sc_b_idx + is + 4;
|
||||
let sc3 = get_byte_i32(scale_vals[is3 / 4], is3 % 4);
|
||||
let is4 = sc_b_idx + is + 6;
|
||||
let sc4 = get_byte_i32(scale_vals[is4 / 4], is4 % 4);
|
||||
|
||||
dst[dst_i + l] = (q1 * f32(sc1)) * d;
|
||||
dst[dst_i + l + 32] = (q2 * f32(sc2)) * d;
|
||||
dst[dst_i + l + 64] = (q3 * f32(sc3)) * d;
|
||||
dst[dst_i + l + 96] = (q4 * f32(sc4)) * d;
|
||||
}
|
||||
dst_i += 128;
|
||||
qh_b_idx += 32;
|
||||
sc_b_idx += 8;
|
||||
}
|
||||
}
|
||||
|
||||
#enddecl(Q6_K)
|
||||
|
||||
#decl(IQ2_XXS)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
var dst_i = dst_base + offset * 256;
|
||||
for (var ib: u32 = 0; ib < 32; ib += 4) {
|
||||
let aux0 = bitcast<u32>(vec2(block.qs[ib], block.qs[ib + 1]));
|
||||
let aux1 = bitcast<u32>(vec2(block.qs[ib + 2], block.qs[ib + 3]));
|
||||
let db = d * (0.5 + f32(aux1 >> 28)) * 0.25;
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let ig = get_byte(aux0, l) * 8;
|
||||
let is = (aux1 >> (7 * l)) & 127;
|
||||
let signs = get_byte(ksigns_iq2xs[is / 4], is % 4);
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let g = get_byte(iq2xxs_grid[(ig + j) / 4], (ig + j) % 4);
|
||||
let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0);
|
||||
dst[dst_i] = db * f32(g) * m;
|
||||
dst_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#enddecl(IQ2_XXS)
|
||||
|
||||
#decl(IQ2_XS)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
var dst_i = dst_base + offset * 256;
|
||||
var scale_vals = array<u32, 2>(
|
||||
bitcast<u32>(vec2(block.scales[0], block.scales[1])),
|
||||
bitcast<u32>(vec2(block.scales[2], block.scales[3]))
|
||||
);
|
||||
for (var ib: u32 = 0; ib < 32; ib += 4) {
|
||||
let s = get_byte(scale_vals[ib / 16], (ib % 16) / 4);
|
||||
let db = array<f32, 2>(
|
||||
d * (0.5 + f32(s & 0xF)) * 0.25,
|
||||
d * (0.5 + f32(s >> 4)) * 0.25
|
||||
);
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let qs_val = bitcast<u32>(vec2(block.qs[ib + l], 0.0));
|
||||
let ig = (qs_val & 511) * 8;
|
||||
let is = qs_val >> 9;
|
||||
let signs = get_byte(ksigns_iq2xs[is / 4], is % 4);
|
||||
let dl = db[l/2];
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let g = get_byte(iq2xs_grid[(ig + j) / 4], (ig + j) % 4);
|
||||
let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0);
|
||||
dst[dst_i] = dl * f32(g) * m;
|
||||
dst_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#enddecl(IQ2_XS)
|
||||
|
||||
#decl(IQ2_S)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
var dst_i = dst_base + offset * 256;
|
||||
var qs_vals : array<u32, 16>;
|
||||
for (var i: u32 = 0; i < 16; i++) {
|
||||
qs_vals[i] = bitcast<u32>(vec2(block.qs[i * 2], block.qs[i * 2 + 1]));
|
||||
}
|
||||
var qh_vals = array<u32, 2>(
|
||||
bitcast<u32>(vec2(block.qh[0], block.qh[1])),
|
||||
bitcast<u32>(vec2(block.qh[2], block.qh[3]))
|
||||
);
|
||||
var scale_vals = array<u32, 2>(
|
||||
bitcast<u32>(vec2(block.scales[0], block.scales[1])),
|
||||
bitcast<u32>(vec2(block.scales[2], block.scales[3]))
|
||||
);
|
||||
for (var ib: u32 = 0; ib < 8; ib ++) {
|
||||
let s = get_byte(scale_vals[ib / 4], ib % 4);
|
||||
let db = array<f32, 2>(
|
||||
d * (0.5 + f32(s & 0xF)) * 0.25,
|
||||
d * (0.5 + f32(s >> 4)) * 0.25
|
||||
);
|
||||
let qs_w = qs_vals[ib];
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let qh_b = (get_byte(qh_vals[ib / 4], ib % 4) << (8 - 2 * l)) & 0x300;
|
||||
let ig = (get_byte(qs_w, l) | qh_b) * 8;
|
||||
let signs = get_byte(qs_vals[ib + 8], l);
|
||||
let dl = db[l/2];
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let g = get_byte(iq2s_grid[(ig + j) / 4], (ig + j) % 4);
|
||||
let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0);
|
||||
dst[dst_i] = dl * f32(g) * m;
|
||||
dst_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#enddecl(IQ2_S)
|
||||
|
||||
#decl(IQ3_XSS)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
var dst_i = dst_base + offset * 256;
|
||||
for (var ib: u32 = 0; ib < 16; ib += 2) {
|
||||
let sc_sign = bitcast<u32>(vec2(block.qs[ib + 32], block.qs[ib + 33]));
|
||||
let db = d * (0.5 + f32(sc_sign >> 28)) * 0.5;
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let is = (sc_sign >> (7 * l)) & 127;
|
||||
let signs = get_byte(ksigns_iq2xs[is / 4], is % 4);
|
||||
let ig_val = bitcast<u32>(vec2(block.qs[ib * 2 + l], 0.0));
|
||||
let ig1 = get_byte(ig_val, 0);
|
||||
let ig2 = get_byte(ig_val, 1);
|
||||
for (var j: u32 = 0; j < 4; j++) {
|
||||
let g1 = get_byte(iq3xxs_grid[ig1], j);
|
||||
let g2 = get_byte(iq3xxs_grid[ig2], j);
|
||||
let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0);
|
||||
let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0);
|
||||
dst[dst_i] = db * f32(g1) * m1;
|
||||
dst[dst_i + 4] = db * f32(g2) * m2;
|
||||
dst_i++;
|
||||
}
|
||||
dst_i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
#enddecl(IQ3_XSS)
|
||||
|
||||
#decl(IQ3_S)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
var dst_i = dst_base + offset * 256;
|
||||
var qh_vals = array<u32, 2>(
|
||||
bitcast<u32>(vec2(block.qh[0], block.qh[1])),
|
||||
bitcast<u32>(vec2(block.qh[2], block.qh[3]))
|
||||
);
|
||||
var sign_vals: array<u32, 8>;
|
||||
for (var i: u32 = 0; i < 8; i++) {
|
||||
sign_vals[i] = bitcast<u32>(vec2(block.signs[i * 2], block.signs[i * 2 + 1]));
|
||||
}
|
||||
var scale_vals = bitcast<u32>(vec2(block.scales[0], block.scales[1]));
|
||||
for (var ib: u32 = 0; ib < 4; ib++) {
|
||||
let s = get_byte(scale_vals, ib);
|
||||
let db = array<f32, 2>(
|
||||
d * (1.0 + 2.0 * f32(s & 0xF)),
|
||||
d * (1.0 + 2.0 * f32(s >> 4))
|
||||
);
|
||||
for (var k: u32 = 0; k < 2; k++) {
|
||||
let dl = db[k];
|
||||
let qh_byte = get_byte(qh_vals[ib / 2], (ib % 2) * 2 + k);
|
||||
let sign_w = sign_vals[ib * 2 + k];
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let signs = get_byte(sign_w, l);
|
||||
let ig_val = bitcast<u32>(vec2(block.qs[ib * 8 + k * 4 + l], 0.0));
|
||||
let ig1 = get_byte(ig_val, 0) | ((qh_byte << ((8 - (2 * l)))) & 256);
|
||||
let ig2 = get_byte(ig_val, 1) | ((qh_byte << ((7 - (2 * l)))) & 256);
|
||||
for (var j: u32 = 0; j < 4; j++) {
|
||||
let g1 = get_byte(iq3s_grid[ig1], j);
|
||||
let g2 = get_byte(iq3s_grid[ig2], j);
|
||||
let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0);
|
||||
let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0);
|
||||
dst[dst_i] = dl * f32(g1) * m1;
|
||||
dst[dst_i + 4] = dl * f32(g2) * m2;
|
||||
dst_i++;
|
||||
}
|
||||
dst_i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#enddecl(IQ3_S)
|
||||
|
||||
#decl(IQ1_S)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
var dst_i = dst_base + offset * 256;
|
||||
for (var ib: u32 = 0; ib < 8; ib++) {
|
||||
let qh = bitcast<u32>(vec2(block.qh[ib], 0.0));
|
||||
let dl = d * (2 * f32((qh >> 12) & 7) + 1);
|
||||
let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x8000) != 0);
|
||||
let qs_w = bitcast<u32>(vec2(block.qs[ib * 2], block.qs[ib * 2 + 1]));
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let ig = (get_byte(qs_w, l) | (((qh >> (3 * l)) & 7) << 8)) * 8;
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let gw = iq1_grid[(ig + j) / 16];
|
||||
let g = (gw >> (((ig + j) % 16) * 2)) & 3;
|
||||
let gs = bitcast<i32>(g << 30) >> 30;
|
||||
dst[dst_i] = dl * (f32(gs) + delta);
|
||||
dst_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#enddecl(IQ1_S)
|
||||
|
||||
#decl(IQ1_M)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
|
||||
let scale = ((block.scales[0] >> 12) & 0xF) | ((block.scales[0] >> 24) & 0x00F0) | ((block.scales[1] >> 4) & 0x0F00) | ((block.scales[1] >> 16) & 0xF000);
|
||||
let d = f32(bitcast<vec2<f16>>(scale).x);
|
||||
var dst_i = dst_base + offset * 256;
|
||||
for (var ib: u32 = 0; ib < 8; ib++) {
|
||||
let sw = (block.scales[ib / 4] >> (16 * ((ib / 2) % 2))) & 0xFFFF;
|
||||
let s1 : u32 = (sw >> (6 * (ib % 2))) & 0x7;
|
||||
let s2 : u32 = (sw >> (6 * (ib % 2) + 3)) & 0x7;
|
||||
var dl = array<f32, 2>(
|
||||
d * f32(2 * s1 + 1),
|
||||
d * f32(2 * s2 + 1)
|
||||
);
|
||||
|
||||
let qh = block.qh[ib / 2] >> (16 * (ib % 2));
|
||||
var idx = array<u32, 4>(
|
||||
get_byte(block.qs[ib], 0) | ((qh << 8) & 0x700),
|
||||
get_byte(block.qs[ib], 1) | ((qh << 4) & 0x700),
|
||||
get_byte(block.qs[ib], 2) | ((qh) & 0x700),
|
||||
get_byte(block.qs[ib], 3) | ((qh >> 4) & 0x700)
|
||||
);
|
||||
var delta = array<f32, 4>(
|
||||
select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x08) != 0),
|
||||
select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x80) != 0),
|
||||
select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x08) != 0),
|
||||
select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x80) != 0)
|
||||
);
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let ig = idx[l] * 8;
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let gw = iq1_grid[(ig + j) / 16];
|
||||
let g = (gw >> (((ig + j) % 16) * 2)) & 3;
|
||||
let gs = bitcast<i32>(g << 30) >> 30;
|
||||
dst[dst_i] = dl[l/2] * (f32(gs) + delta[l]);
|
||||
dst_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#enddecl(IQ1_M)
|
||||
|
||||
#decl(IQ4_NL)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
var dst_i = dst_base + offset * 32;
|
||||
var qs: array<u32, 4>;
|
||||
for (var i: u32 = 0; i < 4; i++) {
|
||||
qs[i] = bitcast<u32>(vec2(block.qs[i * 2], block.qs[i * 2 + 1]));
|
||||
}
|
||||
for (var j: u32 = 0; j < 16; j++) {
|
||||
let qsb = get_byte(qs[j / 4], j % 4);
|
||||
dst[dst_i] = d * f32(kvalues_iq4nl[qsb & 0xF]);
|
||||
dst[dst_i + 16] = d * f32(kvalues_iq4nl[qsb >> 4]);
|
||||
dst_i++;
|
||||
}
|
||||
}
|
||||
#enddecl(IQ4_NL)
|
||||
|
||||
#decl(IQ4_XS)
|
||||
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
|
||||
let block = src[src_base + offset];
|
||||
let d = f32(block.d);
|
||||
let scales_h = bitcast<u32>(vec2(block.scales_h, 0.0));
|
||||
var dst_i = dst_base + offset * 256;
|
||||
for (var ib: u32 = 0; ib < 8; ib++) {
|
||||
let ls = ((get_byte(block.scales_l, ib / 2) >> (4 * (ib % 2))) & 0xF) | (((scales_h >> (2 * ib)) & 3) << 4);
|
||||
let dl = d * (f32(ls) - 32.0);
|
||||
for (var j: u32 = 0; j < 16; j++) {
|
||||
let iqs = ib * 16 + j;
|
||||
let qsb = get_byte(block.qs[iqs / 4], iqs % 4);
|
||||
dst[dst_i] = dl * f32(kvalues_iq4nl[qsb & 0xF]);
|
||||
dst[dst_i + 16] = dl * f32(kvalues_iq4nl[qsb >> 4]);
|
||||
dst_i++;
|
||||
}
|
||||
dst_i += 16;
|
||||
}
|
||||
}
|
||||
#enddecl(IQ4_XS)
|
||||
|
||||
#end(DECLS)
|
||||
|
||||
#define(SHADER)
|
||||
|
||||
enable f16;
|
||||
|
||||
DECLS
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src: array<{{TYPE}}>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> idx: array<i32>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<storage, read_write> dst: array<{{DST_TYPE}}>;
|
||||
|
||||
struct Params {
|
||||
offset_src: u32, // in elements
|
||||
offset_idx: u32, // in elements
|
||||
offset_dst: u32, // in elements
|
||||
|
||||
// Strides (in elements)
|
||||
stride_src1: u32,
|
||||
stride_src2: u32,
|
||||
stride_src3: u32,
|
||||
|
||||
stride_idx0: u32,
|
||||
stride_idx1: u32,
|
||||
stride_idx2: u32,
|
||||
|
||||
stride_dst1: u32,
|
||||
stride_dst2: u32,
|
||||
stride_dst3: u32,
|
||||
|
||||
// Shape of dst
|
||||
ne0: u32,
|
||||
n_rows: u32,
|
||||
ne2: u32,
|
||||
ne3: u32,
|
||||
|
||||
// Shape of idx
|
||||
idx1: u32,
|
||||
idx2: u32,
|
||||
};
|
||||
|
||||
@group(0) @binding(3)
|
||||
var<uniform> params: Params;
|
||||
|
||||
override wg_size: u32;
|
||||
@compute @workgroup_size(wg_size)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
if (gid.x >= params.n_rows * params.ne2 * params.ne3) {
|
||||
return;
|
||||
}
|
||||
var i = gid.x;
|
||||
let i_dst3 = i / (params.ne2 * params.n_rows);
|
||||
|
||||
i = i % (params.ne2 * params.n_rows);
|
||||
let i_dst2 = i / params.n_rows;
|
||||
let i_dst1 = i % params.n_rows;
|
||||
|
||||
let i_idx2 = i_dst3 % params.idx2;
|
||||
let i_idx1 = i_dst2 % params.idx1;
|
||||
let i_idx0 = i_dst1;
|
||||
|
||||
let i_idx = params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2;
|
||||
|
||||
let idx_val = u32(idx[i_idx]);
|
||||
|
||||
let i_src_row = params.offset_src + idx_val * params.stride_src1 + i_dst2 * params.stride_src2 + i_dst3 * params.stride_src3;
|
||||
let i_dst_row = params.offset_dst + i_dst1 * params.stride_dst1 + i_dst2 * params.stride_dst2 + i_dst3 * params.stride_dst3;
|
||||
|
||||
for (var i: u32 = 0; i < params.ne0/{{BLOCK_SIZE}}; i++) {
|
||||
copy_elements(i_src_row, i_dst_row, i);
|
||||
}
|
||||
}
|
||||
|
||||
#end(SHADER)
|
||||
@@ -0,0 +1,44 @@
|
||||
#define(VARIANTS)
|
||||
|
||||
[
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "f32",
|
||||
}
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "f16",
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
#end(VARIANTS)
|
||||
|
||||
#define(SHADER)
|
||||
|
||||
enable f16;
|
||||
|
||||
#include "binary_head.tmpl"
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src0: array<{{TYPE}}>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> src1: array<{{TYPE}}>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<storage, read_write> dst: array<{{TYPE}}>;
|
||||
|
||||
@group(0) @binding(3)
|
||||
var<uniform> params: Params;
|
||||
|
||||
override wg_size: u32;
|
||||
@compute @workgroup_size(wg_size)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
if (gid.x < params.ne) {
|
||||
dst[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] * src1[params.offset_src1 + src1_index(gid.x)];
|
||||
}
|
||||
}
|
||||
|
||||
#end(SHADER)
|
||||
@@ -0,0 +1,41 @@
|
||||
#define(VARIANTS)
|
||||
|
||||
[
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "f32",
|
||||
}
|
||||
},
|
||||
{
|
||||
"REPLS": {
|
||||
"TYPE" : "f16",
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
#end(VARIANTS)
|
||||
|
||||
#define(SHADER)
|
||||
|
||||
enable f16;
|
||||
|
||||
#include "binary_head.tmpl"
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src0: array<{{TYPE}}>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> src1: array<{{TYPE}}>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
|
||||
override wg_size: u32;
|
||||
@compute @workgroup_size(wg_size)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
if (gid.x < params.ne) {
|
||||
src0[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] * src1[params.offset_src1 + src1_index(gid.x)];
|
||||
}
|
||||
}
|
||||
|
||||
#end(SHADER)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,57 @@
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src: array<f32>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> dst: array<f32>;
|
||||
|
||||
struct Params {
|
||||
offset_src: u32, // in elements
|
||||
offset_dst: u32, // in elements
|
||||
|
||||
// Strides (in elements)
|
||||
stride_src1: u32,
|
||||
stride_src2: u32,
|
||||
stride_src3: u32,
|
||||
|
||||
stride_dst1: u32,
|
||||
stride_dst2: u32,
|
||||
stride_dst3: u32,
|
||||
|
||||
// Shape of src/dst
|
||||
ne0: u32,
|
||||
ne1: u32,
|
||||
ne2: u32,
|
||||
ne3: u32,
|
||||
|
||||
eps: u32
|
||||
};
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
|
||||
override wg_size: u32;
|
||||
@compute @workgroup_size(wg_size)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
if (gid.x >= params.ne1 * params.ne2 * params.ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
// one thread per row
|
||||
var i = gid.x;
|
||||
let i3 = i / (params.ne2 * params.ne1);
|
||||
i = i % (params.ne2 * params.ne1);
|
||||
let i2 = i / params.ne1;
|
||||
let i1 = i % params.ne1;
|
||||
let i_src_row = params.offset_src + i3 * params.stride_src3 + i2 * params.stride_src2 + i1 * params.stride_src1;
|
||||
let i_dst_row = params.offset_src + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1;
|
||||
|
||||
var sum = 0.0f;
|
||||
for (var j: u32 = 0; j < params.ne0; j++) {
|
||||
sum += src[i_src_row + j] * src[i_src_row + j];
|
||||
}
|
||||
let eps = bitcast<f32>(params.eps);
|
||||
let scale = 1.0/sqrt(sum/f32(params.ne0) + eps);
|
||||
for (var j: u32 = 0; j < params.ne0; j++) {
|
||||
dst[i_dst_row + j] = scale * src[i_src_row + j];
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> a: array<f32>;
|
||||
|
||||
struct Params {
|
||||
offset: u32, // in elements
|
||||
|
||||
// Strides (in elements)
|
||||
stride1: u32,
|
||||
stride2: u32,
|
||||
stride3: u32,
|
||||
|
||||
// Shape
|
||||
ne0: u32,
|
||||
ne1: u32,
|
||||
ne2: u32,
|
||||
ne3: u32,
|
||||
|
||||
eps: u32
|
||||
};
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<uniform> params: Params;
|
||||
|
||||
override wg_size: u32;
|
||||
@compute @workgroup_size(wg_size)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
if (gid.x >= params.ne1 * params.ne2 * params.ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
// one thread per row
|
||||
var i = gid.x;
|
||||
let i3 = i / (params.ne2 * params.ne1);
|
||||
i = i % (params.ne2 * params.ne1);
|
||||
let i2 = i / params.ne1;
|
||||
let i1 = i % params.ne1;
|
||||
let i_row = params.offset + i3 * params.stride3 + i2 * params.stride2 + i1 * params.stride1;
|
||||
|
||||
var sum = 0.0f;
|
||||
for (var j: u32 = 0; j < params.ne0; j++) {
|
||||
sum += a[i_row + j] * a[i_row + j];
|
||||
}
|
||||
let eps = bitcast<f32>(params.eps);
|
||||
let scale = 1.0/sqrt(sum/f32(params.ne0) + eps);
|
||||
for (var j: u32 = 0; j < params.ne0; j++) {
|
||||
a[i_row + j] = scale * a[i_row + j];
|
||||
}
|
||||
}
|
||||
@@ -52,7 +52,6 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
}
|
||||
var i = gid.x;
|
||||
let i_src3 = i / (params.ne2 * params.n_rows);
|
||||
let i_dst3 = i / (params.ne2 * 3);
|
||||
|
||||
i = i % (params.ne2 * params.n_rows);
|
||||
let i_src2 = i / params.n_rows;
|
||||
|
||||
@@ -76,6 +76,7 @@ struct ggml_backend_zdnn_context {
|
||||
|
||||
struct ggml_backend_zdnn_buffer {
|
||||
void * data;
|
||||
ggml_backend_zdnn_buffer * extra; // for bias, etc.
|
||||
size_t size;
|
||||
|
||||
zdnn_tensor_desc pre_tfm_desc;
|
||||
|
||||
@@ -115,9 +115,7 @@ static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_ten
|
||||
ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra;
|
||||
ggml_backend_zdnn_buffer * inputs_extra = (ggml_backend_zdnn_buffer *)inputs->extra;
|
||||
ggml_backend_zdnn_buffer * output_extra = (ggml_backend_zdnn_buffer *)output->extra;
|
||||
|
||||
zdnn_tensor_desc ptd_bias, td_bias;
|
||||
zdnn_ztensor zt_bias;
|
||||
ggml_backend_zdnn_buffer * bias_extra = (ggml_backend_zdnn_buffer *)output_extra->extra;
|
||||
|
||||
const int64_t weights_rows = ne01;
|
||||
const int64_t weights_cols = ne00;
|
||||
@@ -129,14 +127,6 @@ static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_ten
|
||||
const int64_t output_rows = ne1;
|
||||
const int64_t output_cols = ne0;
|
||||
|
||||
const int64_t bias_dim [GGML_MAX_DIMS] = { 1, 1, 1, output_cols };
|
||||
ggml_zdnn_create_tensor(ptd_bias, td_bias, zt_bias, output, bias_dim, ZDNN_1D);
|
||||
|
||||
void * bias_data = (void *)calloc(ne0, ggml_element_size(output));
|
||||
if (weights_extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(weights_extra->ztensor, weights->data);
|
||||
if (inputs_extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(inputs_extra->ztensor, inputs->data);
|
||||
ggml_zdnn_load_tensor(zt_bias, bias_data);
|
||||
|
||||
// GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
|
||||
// __func__, weights_extra->name,
|
||||
// weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0],
|
||||
@@ -158,29 +148,21 @@ static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_ten
|
||||
GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1 == inputs->ne[0] && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]");
|
||||
GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2 == inputs->ne[1] && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]");
|
||||
|
||||
ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &zt_bias,
|
||||
ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &bias_extra->ztensor,
|
||||
false, true, MATMUL_OP_ADDITION, &output_extra->ztensor));
|
||||
// TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient.
|
||||
ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data));
|
||||
|
||||
ZDNN_CHECK(zdnn_free_ztensor_buffer(&zt_bias));
|
||||
free(bias_data);
|
||||
GGML_UNUSED(ctx);
|
||||
GGML_UNUSED(weights_rows);
|
||||
GGML_UNUSED(weights_cols);
|
||||
GGML_UNUSED(inputs_rows);
|
||||
GGML_UNUSED(inputs_cols);
|
||||
GGML_UNUSED(output_rows);
|
||||
GGML_UNUSED(output_cols);
|
||||
}
|
||||
|
||||
static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
bool use_mul_mat_vec =
|
||||
(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F16)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
|
||||
|
||||
bool use_mul_mat_vec_q =
|
||||
ggml_is_quantized(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||
|
||||
bool use_mul_mat_q =
|
||||
ggml_is_quantized(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||
|
||||
// debug helpers
|
||||
// GGML_LOG_INFO("%s: use_mul_mat_vec = %d\n", __func__, use_mul_mat_vec);
|
||||
// GGML_LOG_INFO("%s: use_mul_mat_vec_q = %d\n", __func__, use_mul_mat_vec_q);
|
||||
@@ -192,25 +174,7 @@ static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const gg
|
||||
// GGML_LOG_INFO("%s: src0 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
||||
// GGML_LOG_INFO("%s: src1 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||
|
||||
if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16
|
||||
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1)
|
||||
&& src1->ne[2] * src1->ne[3] > 1) {
|
||||
// general KQ + KQV multi-batch
|
||||
GGML_LOG_INFO("%s: using zdnn_mul_mat_batched for KQ + KQV multi-batch\n", __func__);
|
||||
// ggml_zdnn_mul_mat_batched(ctx, src0, src1, dst);
|
||||
} else if (use_mul_mat_vec) {
|
||||
GGML_LOG_INFO("%s: using zdnn_op_mul_mat_vec for vector multiplication\n", __func__);
|
||||
// ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_vec, nullptr);
|
||||
} else if (use_mul_mat_vec_q) {
|
||||
GGML_LOG_INFO("%s: using zdnn_op_mul_mat_vec_q for quantized vector multiplication\n", __func__);
|
||||
// ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_vec_q, ggml_zdnn_quantize_row_q8_1);
|
||||
} else if (use_mul_mat_q) {
|
||||
GGML_LOG_INFO("%s: using zdnn_op_mul_mat_q for quantized matrix multiplication\n", __func__);
|
||||
// ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_q, ggml_zdnn_quantize_mmq_q8_1);
|
||||
} else {
|
||||
// GGML_LOG_INFO("%s: using zdnn_op_mul_mat for general matrix multiplication\n", __func__);
|
||||
ggml_zdnn_mul_mat_op(ctx, src0, src1, dst);
|
||||
}
|
||||
ggml_zdnn_mul_mat_op(ctx, src0, src1, dst);
|
||||
}
|
||||
|
||||
static bool ggml_zdnn_compute_forward(ggml_backend_zdnn_context * ctx, ggml_tensor * dst) {
|
||||
@@ -253,6 +217,8 @@ static enum ggml_status ggml_zdnn_graph_compute(ggml_backend_t backend, ggml_cgr
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
|
||||
GGML_UNUSED(ctx_dev);
|
||||
}
|
||||
|
||||
static bool ggml_zdnn_supports_op(const ggml_backend_zdnn_device_context * ctx_dev, const ggml_tensor * op) {
|
||||
@@ -266,22 +232,30 @@ static bool ggml_zdnn_supports_op(const ggml_backend_zdnn_device_context * ctx_d
|
||||
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
const ggml_tensor * src0 = op->src[0];
|
||||
const ggml_tensor * src1 = op->src[1];
|
||||
const ggml_tensor * weights = op->src[0];
|
||||
const ggml_tensor * inputs = op->src[1];
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne0 = op->ne[0];
|
||||
const int64_t ne1 = op->ne[1];
|
||||
const int64_t ne10 = inputs->ne[0];
|
||||
const int64_t ne0 = op->ne[0];
|
||||
const int64_t ne1 = op->ne[1];
|
||||
|
||||
const int64_t max_batch = ctx_dev->max_size;
|
||||
|
||||
return ggml_is_matrix(src0) &&
|
||||
ggml_is_matrix(src1) &&
|
||||
ggml_is_contiguous(src0) &&
|
||||
ggml_is_contiguous(src1) &&
|
||||
src0->view_src == nullptr && src1->view_src == nullptr &&
|
||||
src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 &&
|
||||
(ne0 <= max_batch && ne1 <= max_batch && ne10 <= max_batch);
|
||||
if (!ggml_is_matrix(weights) || !ggml_is_matrix(inputs) ||
|
||||
!ggml_is_contiguous(weights) || !ggml_is_contiguous(inputs) ||
|
||||
weights->view_src != nullptr || inputs->view_src != nullptr ||
|
||||
ne0 > max_batch || ne1 > max_batch || ne10 > max_batch) {
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (weights->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_BF16:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
|
||||
default:
|
||||
@@ -374,10 +348,12 @@ static void ggml_zdnn_free(ggml_backend_zdnn_context * ctx) {
|
||||
static void ggml_backend_zdnn_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_zdnn_buffer_context * ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;
|
||||
|
||||
for (int i = 0; i < ctx->n_buffers; i++) {
|
||||
if (ctx->buffers[i]->ztensor.buffer != NULL && ctx->buffers[i]->ztensor.is_transformed) {
|
||||
ZDNN_CHECK(zdnn_free_ztensor_buffer(&ctx->buffers[i]->ztensor));
|
||||
}
|
||||
for (const auto & buf_ptr : ctx->buffers) {
|
||||
ggml_backend_zdnn_buffer * buf = buf_ptr.get();
|
||||
|
||||
// Free any extra buffer allocated for the tensor. E.g., bias for GGML_OP_MUL_MAT
|
||||
if (buf->extra != nullptr) free(buf->extra->data);
|
||||
if (buf->ztensor.buffer_size > 0) ZDNN_CHECK(zdnn_free_ztensor_buffer(&buf->ztensor));
|
||||
}
|
||||
|
||||
delete ctx;
|
||||
@@ -402,11 +378,37 @@ static enum ggml_status ggml_backend_zdnn_buffer_init_tensor(ggml_backend_buffer
|
||||
std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
|
||||
zdnn_buffer->data = tensor->data;
|
||||
zdnn_buffer->size = tsize;
|
||||
strncpy(zdnn_buffer->name, tensor->name, GGML_MAX_NAME - 1);
|
||||
zdnn_buffer->extra = nullptr;
|
||||
snprintf(zdnn_buffer->name, GGML_MAX_NAME, "%s", tensor->name);
|
||||
|
||||
ggml_zdnn_init_tensor(zdnn_buffer.get(), tensor);
|
||||
tensor->extra = zdnn_buffer.get();
|
||||
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_bias_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
|
||||
zdnn_bias_buffer->data = (void *)calloc(tensor->ne[0], ggml_element_size(tensor));
|
||||
zdnn_bias_buffer->size = ggml_element_size(tensor) * tensor->ne[0];
|
||||
snprintf(zdnn_bias_buffer->name, GGML_MAX_NAME, "%.*s (bias)",
|
||||
GGML_MAX_NAME - (int)sizeof(" (bias)"), tensor->name);
|
||||
|
||||
const int64_t bias_dim[GGML_MAX_DIMS] = { 1, 1, 1, tensor->ne[0] };
|
||||
ggml_zdnn_create_tensor(zdnn_bias_buffer->pre_tfm_desc,
|
||||
zdnn_bias_buffer->tfm_desc,
|
||||
zdnn_bias_buffer->ztensor,
|
||||
tensor, bias_dim, ZDNN_1D);
|
||||
|
||||
ggml_zdnn_load_tensor(zdnn_bias_buffer->ztensor, zdnn_bias_buffer->data);
|
||||
zdnn_buffer->extra = zdnn_bias_buffer.get();
|
||||
|
||||
ctx->buffers.push_back(std::move(zdnn_bias_buffer));
|
||||
ctx->n_buffers++;
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
ctx->buffers.push_back(std::move(zdnn_buffer));
|
||||
ctx->n_buffers++;
|
||||
|
||||
@@ -414,6 +416,8 @@ static enum ggml_status ggml_backend_zdnn_buffer_init_tensor(ggml_backend_buffer
|
||||
// __func__, tensor->name, buffer_idx, tsize);
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
|
||||
GGML_UNUSED(buffer_idx);
|
||||
}
|
||||
|
||||
static void ggml_backend_zdnn_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
@@ -425,6 +429,13 @@ static void ggml_backend_zdnn_buffer_memset_tensor(ggml_backend_buffer_t buffer,
|
||||
static void ggml_backend_zdnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
memcpy((char *)tensor->data + offset, data, size);
|
||||
|
||||
ggml_backend_zdnn_buffer * extra = (ggml_backend_zdnn_buffer *)tensor->extra;
|
||||
|
||||
// Fixes the LLAMA_SET_ROWS bug
|
||||
// see: https://github.com/ggml-org/llama.cpp/issues/15414
|
||||
if (tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_COMPUTE && extra->ztensor.is_transformed) zdnn_reset_ztensor(&extra->ztensor);
|
||||
if (extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(extra->ztensor, tensor->data);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
@@ -528,29 +539,6 @@ ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void) {
|
||||
return &ggml_backend_buffer_type_zdnn;
|
||||
}
|
||||
|
||||
static const char * ggml_backend_zdnn_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return GGML_ZDNN_NAME "_Mapped";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_from_ptr_type(void) {
|
||||
static ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_zdnn = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_zdnn_buffer_from_ptr_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_zdnn_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_zdnn_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .is_host = */ ggml_backend_zdnn_buffer_type_is_host,
|
||||
},
|
||||
/* .device = */ &g_ggml_backend_zdnn_device,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_buffer_from_ptr_type_zdnn;
|
||||
}
|
||||
|
||||
//
|
||||
// backend
|
||||
//
|
||||
@@ -594,27 +582,6 @@ static ggml_guid_t ggml_backend_zdnn_guid(void) {
|
||||
return reinterpret_cast<ggml_guid_t>((void *)guid_str);
|
||||
}
|
||||
|
||||
// TODO: remove in the future
|
||||
ggml_backend_t ggml_backend_zdnn_init(void) {
|
||||
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_zdnn_reg(), 0);
|
||||
|
||||
ggml_backend_zdnn_context * ctx = ggml_zdnn_init(dev);
|
||||
if (ctx == NULL) {
|
||||
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(ggml_backend));
|
||||
*backend = (ggml_backend) {
|
||||
/* .guid = */ ggml_backend_zdnn_guid(),
|
||||
/* .iface = */ ggml_backend_zdnn_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
return backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_zdnn(ggml_backend_t backend) {
|
||||
return backend != NULL &&
|
||||
ggml_guid_matches(backend->guid, ggml_backend_zdnn_guid());
|
||||
@@ -634,11 +601,15 @@ static const char * ggml_backend_zdnn_device_get_name(ggml_backend_dev_t dev) {
|
||||
|
||||
static const char * ggml_backend_zdnn_device_get_description(ggml_backend_dev_t dev) {
|
||||
return "IBM Z Neural Network Processing Assist (NNPA)";
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static void ggml_backend_zdnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
*free = 0;
|
||||
*total = 0;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_zdnn_device_get_type(ggml_backend_dev_t dev) {
|
||||
@@ -655,8 +626,8 @@ static void ggml_backend_zdnn_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
props->caps = (ggml_backend_dev_caps) {
|
||||
/* .async = */ false,
|
||||
/* .host_buffer = */ false,
|
||||
/* .buffer_from_host_ptr = */ true,
|
||||
/* .events = */ false,
|
||||
/* .buffer_from_host_ptr = */ false,
|
||||
/* .events = */ false
|
||||
};
|
||||
}
|
||||
|
||||
@@ -672,7 +643,7 @@ static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, cons
|
||||
/* .guid = */ ggml_backend_zdnn_guid(),
|
||||
/* .iface = */ ggml_backend_zdnn_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ ctx,
|
||||
/* .context = */ ctx
|
||||
};
|
||||
|
||||
return backend;
|
||||
@@ -686,46 +657,6 @@ static ggml_backend_buffer_type_t ggml_backend_zdnn_device_get_buffer_type(ggml_
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_zdnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
ggml_backend_zdnn_buffer_context * ctx = new ggml_backend_zdnn_buffer_context();
|
||||
|
||||
ctx->all_data = ptr;
|
||||
ctx->all_size = size;
|
||||
ctx->owned = false;
|
||||
ctx->n_buffers = 0;
|
||||
|
||||
const size_t size_page = sysconf(_SC_PAGESIZE);
|
||||
|
||||
// page-align the data ptr
|
||||
{
|
||||
const uintptr_t offs = (uintptr_t) ptr % size_page;
|
||||
ptr = (void *)((char *)ptr - offs);
|
||||
size += offs;
|
||||
}
|
||||
|
||||
size_t size_aligned = size;
|
||||
if ((size_aligned % size_page) != 0) {
|
||||
size_aligned += size_page - (size_aligned % size_page);
|
||||
}
|
||||
|
||||
ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)dev->context;
|
||||
|
||||
GGML_ASSERT(ctx_dev->zdnn_device >= 0);
|
||||
int device = ctx_dev->zdnn_device; GGML_UNUSED(device);
|
||||
|
||||
std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
|
||||
zdnn_buffer->data = ptr;
|
||||
zdnn_buffer->size = size;
|
||||
ctx->buffers.push_back(std::move(zdnn_buffer));
|
||||
|
||||
GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB\n",
|
||||
__func__, size_aligned / 1024.0 / 1024.0);
|
||||
|
||||
++ctx->n_buffers;
|
||||
|
||||
return ggml_backend_buffer_init(ggml_backend_zdnn_buffer_from_ptr_type(), ggml_backend_zdnn_buffer_i, ctx, size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_zdnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *) dev->context;
|
||||
|
||||
@@ -734,8 +665,7 @@ static bool ggml_backend_zdnn_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
|
||||
static bool ggml_backend_zdnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
return
|
||||
buft->iface.get_name == ggml_backend_zdnn_buffer_type_get_name ||
|
||||
buft->iface.get_name == ggml_backend_zdnn_buffer_from_ptr_type_get_name;
|
||||
buft->iface.get_name == ggml_backend_zdnn_buffer_type_get_name;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
@@ -749,7 +679,7 @@ static ggml_backend_device_i ggml_backend_zdnn_device_i = {
|
||||
/* .init_backend = */ ggml_backend_zdnn_device_init,
|
||||
/* .get_buffer_type = */ ggml_backend_zdnn_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_zdnn_device_buffer_from_ptr,
|
||||
/* .buffer_from_host_ptr = */ NULL,
|
||||
/* .supports_op = */ ggml_backend_zdnn_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_zdnn_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
@@ -813,7 +743,7 @@ static ggml_backend_reg_i ggml_backend_zdnn_reg_i = {
|
||||
/* .get_name = */ ggml_backend_zdnn_reg_get_name,
|
||||
/* .get_device_count = */ ggml_backend_zdnn_reg_device_count,
|
||||
/* .get_device = */ ggml_backend_zdnn_reg_device_get,
|
||||
/* .get_proc_address = */ ggml_backend_zdnn_get_proc_address,
|
||||
/* .get_proc_address = */ ggml_backend_zdnn_get_proc_address
|
||||
};
|
||||
|
||||
static void ggml_zdnn_cleanup(void) {
|
||||
@@ -831,13 +761,13 @@ ggml_backend_reg_t ggml_backend_zdnn_reg(void) {
|
||||
g_ggml_backend_zdnn_reg = (ggml_backend_reg) {
|
||||
/* .api_version = */ GGML_ZDNN_VERSION,
|
||||
/* .iface = */ ggml_backend_zdnn_reg_i,
|
||||
/* .context = */ NULL,
|
||||
/* .context = */ NULL
|
||||
};
|
||||
|
||||
g_ggml_backend_zdnn_device = (ggml_backend_device) {
|
||||
/* .iface = */ ggml_backend_zdnn_device_i,
|
||||
/* .reg = */ &g_ggml_backend_zdnn_reg,
|
||||
/* .context = */ &g_ggml_ctx_dev_main,
|
||||
/* .context = */ &g_ggml_ctx_dev_main
|
||||
};
|
||||
|
||||
return &g_ggml_backend_zdnn_reg;
|
||||
|
||||
+1
-5
@@ -4923,12 +4923,8 @@ struct ggml_tensor * ggml_timestep_embedding(
|
||||
struct ggml_tensor * timesteps,
|
||||
int dim,
|
||||
int max_period) {
|
||||
int actual_dim = dim;
|
||||
if (dim % 2 != 0) {
|
||||
actual_dim = dim + 1;
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
|
||||
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps->ne[0]);
|
||||
|
||||
ggml_set_op_params_i32(result, 0, dim);
|
||||
ggml_set_op_params_i32(result, 1, max_period);
|
||||
|
||||
@@ -111,6 +111,7 @@ class Keys:
|
||||
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
||||
DECODER_BLOCK_COUNT = "{arch}.decoder_block_count"
|
||||
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
|
||||
ROUTER_LOGIT_SOFTCAPPING = "{arch}.router_logit_softcapping"
|
||||
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
|
||||
SWIN_NORM = "{arch}.swin_norm"
|
||||
RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
|
||||
@@ -146,21 +147,27 @@ class Keys:
|
||||
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
|
||||
SLIDING_WINDOW = "{arch}.attention.sliding_window"
|
||||
SCALE = "{arch}.attention.scale"
|
||||
OUTPUT_SCALE = "{arch}.attention.output_scale"
|
||||
TEMPERATURE_LENGTH = "{arch}.attention.temperature_length"
|
||||
KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
|
||||
VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
|
||||
SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
|
||||
SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
|
||||
|
||||
class Rope:
|
||||
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||
DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
|
||||
FREQ_BASE = "{arch}.rope.freq_base"
|
||||
SCALING_TYPE = "{arch}.rope.scaling.type"
|
||||
SCALING_FACTOR = "{arch}.rope.scaling.factor"
|
||||
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
|
||||
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
|
||||
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
||||
SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
|
||||
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||
DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
|
||||
FREQ_BASE = "{arch}.rope.freq_base"
|
||||
SCALING_TYPE = "{arch}.rope.scaling.type"
|
||||
SCALING_FACTOR = "{arch}.rope.scaling.factor"
|
||||
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
|
||||
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
|
||||
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
||||
SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
|
||||
SCALING_YARN_EXT_FACTOR = "{arch}.rope.scaling.yarn_ext_factor"
|
||||
SCALING_YARN_ATTN_FACTOR = "{arch}.rope.scaling.yarn_attn_factor"
|
||||
SCALING_YARN_BETA_FAST = "{arch}.rope.scaling.yarn_beta_fast"
|
||||
SCALING_YARN_BETA_SLOW = "{arch}.rope.scaling.yarn_beta_slow"
|
||||
|
||||
class Split:
|
||||
LLM_KV_SPLIT_NO = "split.no"
|
||||
@@ -392,6 +399,7 @@ class MODEL_ARCH(IntEnum):
|
||||
DREAM = auto()
|
||||
SMALLTHINKER = auto()
|
||||
LLADA = auto()
|
||||
LLADA_MOE = auto()
|
||||
SEED_OSS = auto()
|
||||
|
||||
|
||||
@@ -728,6 +736,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.DREAM: "dream",
|
||||
MODEL_ARCH.SMALLTHINKER: "smallthinker",
|
||||
MODEL_ARCH.LLADA: "llada",
|
||||
MODEL_ARCH.LLADA_MOE: "llada-moe",
|
||||
MODEL_ARCH.SEED_OSS: "seed_oss",
|
||||
}
|
||||
|
||||
@@ -1114,6 +1123,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_POST_NORM,
|
||||
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||
],
|
||||
MODEL_ARCH.GPTNEOX: [
|
||||
@@ -2685,6 +2695,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.LLADA_MOE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user