mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-15 10:16:45 +02:00
Compare commits
77 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5edf1592fd | |||
| db3010bd23 | |||
| ff27f80a74 | |||
| d3248d9b65 | |||
| 7aeee88cfe | |||
| b07791aa1d | |||
| 4227c9be42 | |||
| df36bce667 | |||
| f75b830647 | |||
| 7a0de96045 | |||
| e4e915912c | |||
| 5ba36f6103 | |||
| b204a5a234 | |||
| 646944cfa8 | |||
| 1a01899b61 | |||
| 863d341eeb | |||
| d32e03f449 | |||
| 3973163bff | |||
| 5ade3000bd | |||
| 8b2483730f | |||
| 810b9fc8b9 | |||
| 4ebd0c125b | |||
| 5cdb27e091 | |||
| 3ea913f1ce | |||
| 29c8fbe4e0 | |||
| 1adc9812bd | |||
| b3e16665e1 | |||
| c24f4e2688 | |||
| d8914fc47e | |||
| e885445bc1 | |||
| 648ebcdb73 | |||
| 07aa869a91 | |||
| 00f35d509e | |||
| 6028bf7435 | |||
| bc5182272c | |||
| e71d48e326 | |||
| b0493156fa | |||
| f4586ee598 | |||
| 60a7658810 | |||
| efe3a90996 | |||
| bbd57b7eaf | |||
| 25ff6f7659 | |||
| be48528b06 | |||
| cf9e5648a7 | |||
| fba5c0d680 | |||
| 53d0a12658 | |||
| 27093afe78 | |||
| 228f724d9c | |||
| cd3069dfcb | |||
| 50e81bdf5d | |||
| 1ebbaddff2 | |||
| a3a7874272 | |||
| 002cb1bb33 | |||
| 79c1160b07 | |||
| 34c9d765bf | |||
| e54d41befc | |||
| 4850b52aed | |||
| cd6983d56d | |||
| 6c7e9a5440 | |||
| 1425f587a8 | |||
| aaa3d07ae7 | |||
| 50aa938901 | |||
| c4f53563df | |||
| a0552c8bee | |||
| 99acbc9921 | |||
| 7ad67ba9fe | |||
| 9a96389544 | |||
| 1d72c84188 | |||
| 20638e4f16 | |||
| 36d3f00e14 | |||
| 5fd160bbd9 | |||
| 756cfea826 | |||
| e725a1a982 | |||
| 3db4da56a5 | |||
| 476aa3fd57 | |||
| 0d8831543c | |||
| 65c797c4fa |
@@ -1,22 +0,0 @@
|
||||
node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
|
||||
stage('Cleanup'){
|
||||
cleanWs() // Cleaning previous CI build in workspace
|
||||
}
|
||||
stage('checkout repo'){
|
||||
retry(5){ // Retry if the cloning fails due to some reason
|
||||
checkout scm // Clone the repo on Runner
|
||||
}
|
||||
}
|
||||
stage('Compiling llama.cpp'){
|
||||
sh'''#!/bin/bash
|
||||
make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
|
||||
'''
|
||||
}
|
||||
stage('Running llama.cpp'){
|
||||
sh'''#!/bin/bash
|
||||
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
|
||||
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
|
||||
cat llama_log.txt # Printing results
|
||||
'''
|
||||
}
|
||||
}
|
||||
@@ -4,8 +4,6 @@ FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
ARG TARGETARCH
|
||||
|
||||
ARG GGML_CPU_ARM_ARCH=armv8-a
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||
|
||||
@@ -13,10 +11,8 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then \
|
||||
RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
|
||||
elif [ "$TARGETARCH" = "arm64" ]; then \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
|
||||
else \
|
||||
echo "Unsupported architecture"; \
|
||||
exit 1; \
|
||||
|
||||
@@ -60,8 +60,7 @@ RUN apt-get update \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
&& pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
|
||||
@@ -40,7 +40,7 @@ body:
|
||||
attributes:
|
||||
label: GGML backends
|
||||
description: Which GGML backends do you know to be affected?
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
|
||||
multiple: true
|
||||
validations:
|
||||
required: true
|
||||
|
||||
@@ -42,7 +42,7 @@ body:
|
||||
attributes:
|
||||
label: GGML backends
|
||||
description: Which GGML backends do you know to be affected?
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
|
||||
multiple: true
|
||||
validations:
|
||||
required: true
|
||||
|
||||
@@ -22,6 +22,11 @@ Vulkan:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-vulkan.h
|
||||
- ggml/src/ggml-vulkan/**
|
||||
IBM zDNN:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-zdnn.h
|
||||
- ggml/src/ggml-zdnn/**
|
||||
documentation:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
name: Build on RISCV Linux Machine by Cloud-V
|
||||
on:
|
||||
workflow_dispatch:
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
bianbu-riscv64-native: # Bianbu 2.2
|
||||
runs-on: self-hosted
|
||||
|
||||
steps:
|
||||
- name: Install prerequisites
|
||||
run: |
|
||||
sudo apt-get update || true
|
||||
sudo apt-get install -y libatomic1
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup Riscv
|
||||
run: |
|
||||
sudo apt-get update || true
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
gcc-14-riscv64-linux-gnu \
|
||||
g++-14-riscv64-linux-gnu \
|
||||
cmake
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build -DLLAMA_CURL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=Linux \
|
||||
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
|
||||
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
+28
-35
@@ -64,7 +64,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-arm64
|
||||
evict-old-files: 1d
|
||||
@@ -104,7 +104,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-x64
|
||||
evict-old-files: 1d
|
||||
@@ -144,7 +144,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-arm64-webgpu
|
||||
evict-old-files: 1d
|
||||
@@ -179,7 +179,6 @@ jobs:
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
export LLAMA_SET_ROWS=0
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
@@ -200,7 +199,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-cpu-cmake
|
||||
evict-old-files: 1d
|
||||
@@ -252,7 +251,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
|
||||
evict-old-files: 1d
|
||||
@@ -331,7 +330,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-latest-cmake-rpc
|
||||
evict-old-files: 1d
|
||||
@@ -364,7 +363,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-22-cmake-vulkan
|
||||
evict-old-files: 1d
|
||||
@@ -401,7 +400,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-22-cmake-webgpu
|
||||
evict-old-files: 1d
|
||||
@@ -438,14 +437,13 @@ jobs:
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
export LLAMA_SET_ROWS=0
|
||||
cd build
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
ctest -L main --verbose --timeout 3600
|
||||
|
||||
ubuntu-22-cmake-hip:
|
||||
runs-on: ubuntu-22.04
|
||||
container: rocm/dev-ubuntu-22.04:6.0.2
|
||||
container: rocm/dev-ubuntu-22.04:6.1.2
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -459,7 +457,7 @@ jobs:
|
||||
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-22-cmake-hip
|
||||
evict-old-files: 1d
|
||||
@@ -473,16 +471,6 @@ jobs:
|
||||
-DGGML_HIP=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Build with legacy HIP support
|
||||
id: cmake_build_legacy_hip
|
||||
run: |
|
||||
cmake -B build2 -S . \
|
||||
-DCMAKE_C_COMPILER=hipcc \
|
||||
-DCMAKE_CXX_COMPILER=hipcc \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
-DGGML_HIP=ON
|
||||
cmake --build build2 --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-cmake-musa:
|
||||
runs-on: ubuntu-22.04
|
||||
container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
|
||||
@@ -499,7 +487,7 @@ jobs:
|
||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-22-cmake-musa
|
||||
evict-old-files: 1d
|
||||
@@ -544,7 +532,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-22-cmake-sycl
|
||||
evict-old-files: 1d
|
||||
@@ -592,7 +580,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-22-cmake-sycl-fp16
|
||||
evict-old-files: 1d
|
||||
@@ -623,7 +611,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-ios
|
||||
evict-old-files: 1d
|
||||
@@ -660,7 +648,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-tvos
|
||||
evict-old-files: 1d
|
||||
@@ -732,7 +720,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-swift
|
||||
evict-old-files: 1d
|
||||
@@ -778,7 +766,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-msys2
|
||||
variant: ccache
|
||||
@@ -846,7 +834,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-${{ matrix.build }}
|
||||
variant: ccache
|
||||
@@ -960,7 +948,7 @@ jobs:
|
||||
apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-latest-cmake-cuda
|
||||
evict-old-files: 1d
|
||||
@@ -989,7 +977,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-cuda-${{ matrix.cuda }}
|
||||
variant: ccache
|
||||
@@ -1045,7 +1033,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-sycl
|
||||
variant: ccache
|
||||
@@ -1091,7 +1079,7 @@ jobs:
|
||||
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ${{ github.job }}
|
||||
evict-old-files: 1d
|
||||
@@ -1125,6 +1113,11 @@ jobs:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Xcode
|
||||
uses: maxim-lobanov/setup-xcode@v1
|
||||
with:
|
||||
xcode-version: latest-stable
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
@@ -1158,7 +1151,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: android-build
|
||||
evict-old-files: 1d
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
name: "Copilot Setup Steps"
|
||||
|
||||
# Automatically run the setup steps when they are changed to allow for easy validation, and
|
||||
# allow manual testing through the repository's "Actions" tab
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
paths:
|
||||
- .github/workflows/copilot-setup-steps.yml
|
||||
pull_request:
|
||||
paths:
|
||||
- .github/workflows/copilot-setup-steps.yml
|
||||
|
||||
jobs:
|
||||
# The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
|
||||
copilot-setup-steps:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
# Set the permissions to the lowest permissions possible needed for your steps.
|
||||
# Copilot will be given its own token for its operations.
|
||||
permissions:
|
||||
# If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
|
||||
contents: read
|
||||
|
||||
# You can define any steps you want, and they will run before the agent starts.
|
||||
# If you do not check out your code, Copilot will do this for you.
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: copilot-setup-steps
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python3 -m venv .venv
|
||||
.venv/bin/activate
|
||||
pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
|
||||
pip install flake8 pyright
|
||||
@@ -32,7 +32,7 @@ jobs:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-arm64
|
||||
evict-old-files: 1d
|
||||
@@ -85,7 +85,7 @@ jobs:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-x64
|
||||
evict-old-files: 1d
|
||||
@@ -147,7 +147,7 @@ jobs:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-cpu-cmake
|
||||
evict-old-files: 1d
|
||||
@@ -198,7 +198,7 @@ jobs:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-22-cmake-vulkan
|
||||
evict-old-files: 1d
|
||||
@@ -256,7 +256,7 @@ jobs:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-cpu-${{ matrix.arch }}
|
||||
variant: ccache
|
||||
@@ -328,7 +328,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
|
||||
variant: ccache
|
||||
@@ -398,7 +398,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-cuda-${{ matrix.cuda }}
|
||||
variant: ccache
|
||||
@@ -471,7 +471,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-sycl
|
||||
variant: ccache
|
||||
@@ -545,7 +545,7 @@ jobs:
|
||||
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-hip-${{ matrix.name }}-x64
|
||||
evict-old-files: 1d
|
||||
@@ -600,7 +600,7 @@ jobs:
|
||||
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
runs-on: macos-15
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
@@ -608,6 +608,10 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup Xcode
|
||||
run: |
|
||||
sudo xcode-select -s /Applications/Xcode_16.4.app
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
|
||||
@@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
||||
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
||||
endif()
|
||||
|
||||
message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
|
||||
|
||||
# Add path to modules
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||
|
||||
|
||||
@@ -10,3 +10,4 @@
|
||||
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
||||
/ggml/src/gguf.cpp @JohannesGaessler
|
||||
/ggml/src/ggml-vulkan/ @0cc4m
|
||||
/ggml/src/ggml-zdnn/ @taronaeo
|
||||
|
||||
@@ -17,6 +17,7 @@ LLM inference in C/C++
|
||||
|
||||
## Hot topics
|
||||
|
||||
- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
|
||||
- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
|
||||
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
|
||||
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
||||
@@ -240,7 +241,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
<details>
|
||||
<summary>Infrastructure</summary>
|
||||
|
||||
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
|
||||
- [Paddler](https://github.com/intentee/paddler) - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure
|
||||
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
|
||||
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
|
||||
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
|
||||
|
||||
+124
-40
@@ -749,6 +749,39 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
|
||||
// utils
|
||||
//
|
||||
|
||||
// Helper function to parse tensor buffer override strings
|
||||
static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
|
||||
std::map<std::string, ggml_backend_buffer_type_t> buft_list;
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
auto * buft = ggml_backend_dev_buffer_type(dev);
|
||||
if (buft) {
|
||||
buft_list[ggml_backend_buft_name(buft)] = buft;
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & override : string_split<std::string>(value, ',')) {
|
||||
std::string::size_type pos = override.find('=');
|
||||
if (pos == std::string::npos) {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
std::string tensor_name = override.substr(0, pos);
|
||||
std::string buffer_type = override.substr(pos + 1);
|
||||
|
||||
if (buft_list.find(buffer_type) == buft_list.end()) {
|
||||
printf("Available buffer types:\n");
|
||||
for (const auto & it : buft_list) {
|
||||
printf(" %s\n", ggml_backend_buft_name(it.second));
|
||||
}
|
||||
throw std::invalid_argument("unknown buffer type");
|
||||
}
|
||||
// keep strings alive and avoid leaking memory by storing them in a static vector
|
||||
static std::list<std::string> buft_overrides;
|
||||
buft_overrides.push_back(tensor_name);
|
||||
overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
|
||||
}
|
||||
}
|
||||
|
||||
struct handle_model_result {
|
||||
bool found_mmproj = false;
|
||||
common_params_model mmproj;
|
||||
@@ -993,6 +1026,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||
}
|
||||
|
||||
if (!params.speculative.tensor_buft_overrides.empty()) {
|
||||
params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||
}
|
||||
|
||||
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
|
||||
throw std::runtime_error(string_format(
|
||||
"error: the supplied chat template is not supported: %s%s\n",
|
||||
@@ -1201,6 +1238,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
|
||||
common_params_print_completion(ctx_arg);
|
||||
exit(0);
|
||||
}
|
||||
params.lr.init();
|
||||
} catch (const std::invalid_argument & ex) {
|
||||
fprintf(stderr, "%s\n", ex.what());
|
||||
ctx_arg.params = params_org;
|
||||
@@ -1469,6 +1507,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.swa_full = true;
|
||||
}
|
||||
).set_env("LLAMA_ARG_SWA_FULL"));
|
||||
add_opt(common_arg(
|
||||
{"--swa-checkpoints"}, "N",
|
||||
string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
|
||||
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
|
||||
[](common_params & params, int value) {
|
||||
params.n_swa_checkpoints = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--kv-unified", "-kvu"},
|
||||
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
|
||||
@@ -2349,40 +2395,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
add_opt(common_arg(
|
||||
{"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
|
||||
"override tensor buffer type", [](common_params & params, const std::string & value) {
|
||||
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
|
||||
if (buft_list.empty()) {
|
||||
// enumerate all the devices and add their buffer types to the list
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
auto * buft = ggml_backend_dev_buffer_type(dev);
|
||||
if (buft) {
|
||||
buft_list[ggml_backend_buft_name(buft)] = buft;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & override : string_split<std::string>(value, ',')) {
|
||||
std::string::size_type pos = override.find('=');
|
||||
if (pos == std::string::npos) {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
std::string tensor_name = override.substr(0, pos);
|
||||
std::string buffer_type = override.substr(pos + 1);
|
||||
|
||||
if (buft_list.find(buffer_type) == buft_list.end()) {
|
||||
printf("Available buffer types:\n");
|
||||
for (const auto & it : buft_list) {
|
||||
printf(" %s\n", ggml_backend_buft_name(it.second));
|
||||
}
|
||||
throw std::invalid_argument("unknown buffer type");
|
||||
}
|
||||
// keep strings alive and avoid leaking memory by storing them in a static vector
|
||||
static std::list<std::string> buft_overrides;
|
||||
buft_overrides.push_back(tensor_name);
|
||||
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
|
||||
}
|
||||
parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
|
||||
"override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
|
||||
parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--cpu-moe", "-cmoe"},
|
||||
"keep all Mixture of Experts (MoE) weights in the CPU",
|
||||
@@ -2405,6 +2426,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_ARG_N_CPU_MOE"));
|
||||
add_opt(common_arg(
|
||||
{"--cpu-moe-draft", "-cmoed"},
|
||||
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
|
||||
[](common_params & params) {
|
||||
params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
|
||||
add_opt(common_arg(
|
||||
{"--n-cpu-moe-draft", "-ncmoed"}, "N",
|
||||
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
|
||||
[](common_params & params, int value) {
|
||||
if (value < 0) {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
for (int i = 0; i < value; ++i) {
|
||||
static std::list<std::string> buft_overrides_draft;
|
||||
buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
|
||||
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
|
||||
add_opt(common_arg(
|
||||
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
||||
"number of layers to store in VRAM",
|
||||
@@ -2655,7 +2697,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.out_file = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
|
||||
add_opt(common_arg(
|
||||
{"-ofreq", "--output-frequency"}, "N",
|
||||
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
||||
@@ -2949,11 +2991,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
|
||||
"(default: auto)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
|
||||
else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
|
||||
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
|
||||
else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
|
||||
else { throw std::invalid_argument("invalid value"); }
|
||||
params.reasoning_format = common_reasoning_format_from_name(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
|
||||
add_opt(common_arg(
|
||||
@@ -3134,7 +3172,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-tbd", "--threads-batch-draft"}, "N",
|
||||
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
|
||||
@@ -3144,7 +3182,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-Cd", "--cpu-mask-draft"}, "M",
|
||||
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
||||
@@ -3537,5 +3575,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
|
||||
|
||||
add_opt(
|
||||
common_arg({ "-lr", "--learning-rate" }, "ALPHA",
|
||||
string_format(
|
||||
"adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
|
||||
(double) params.lr.lr0),
|
||||
[](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(
|
||||
common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
|
||||
string_format(
|
||||
"(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
|
||||
(double) params.lr.lr_min),
|
||||
[](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(
|
||||
common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
|
||||
string_format(
|
||||
"(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
|
||||
(double) params.lr.decay_epochs),
|
||||
[](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(common_arg(
|
||||
{ "-wd", "--weight-decay" }, "WD",
|
||||
string_format(
|
||||
"adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
|
||||
(double) params.lr.wd),
|
||||
[](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
|
||||
string_format("fraction of data to use as validation set for training (default: %.2g).",
|
||||
(double) params.val_split),
|
||||
[](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(common_arg({ "-epochs", "--epochs" }, "N",
|
||||
string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
|
||||
[](common_params & params, int epochs) { params.lr.epochs = epochs; })
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
|
||||
[](common_params & params, const std::string & name) {
|
||||
params.optimizer = common_opt_get_optimizer(name.c_str());
|
||||
if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
|
||||
throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
|
||||
}
|
||||
})
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
|
||||
return ctx_arg;
|
||||
}
|
||||
|
||||
@@ -55,7 +55,15 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
|
||||
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
|
||||
std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
|
||||
std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
|
||||
std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
|
||||
std::string arguments = "";
|
||||
if (tool_call.contains("arguments")) {
|
||||
if (tool_call.at("arguments").is_object()) {
|
||||
arguments = tool_call.at("arguments").dump();
|
||||
} else {
|
||||
arguments = tool_call.at("arguments");
|
||||
}
|
||||
}
|
||||
|
||||
return add_tool_call(name, id, arguments);
|
||||
}
|
||||
|
||||
|
||||
+310
-7
@@ -296,6 +296,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
|
||||
}
|
||||
if (!msg.reasoning_content.empty()) {
|
||||
jmsg["reasoning_content"] = msg.reasoning_content;
|
||||
jmsg["thinking"] = msg.reasoning_content; // gpt-oss
|
||||
}
|
||||
if (!msg.tool_name.empty()) {
|
||||
jmsg["name"] = msg.tool_name;
|
||||
@@ -472,11 +473,12 @@ std::string common_chat_format_single(
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
|
||||
std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.use_jinja = use_jinja;
|
||||
inputs.add_bos = tmpls->add_bos;
|
||||
inputs.add_eos = tmpls->add_eos;
|
||||
inputs.chat_template_kwargs = chat_template_kwargs;
|
||||
auto add_simple_msg = [&](auto role, auto content) {
|
||||
common_chat_msg msg;
|
||||
msg.role = role;
|
||||
@@ -552,6 +554,17 @@ common_chat_templates_ptr common_chat_templates_init(
|
||||
default_template_src = CHATML_TEMPLATE_SRC;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
|
||||
// Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
|
||||
if (default_template_src.find("<|channel|>") != std::string::npos
|
||||
// search for the error message and patch it
|
||||
&& default_template_src.find("in message.content or") != std::string::npos) {
|
||||
string_replace_all(default_template_src,
|
||||
"{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
|
||||
"{%- if false %}");
|
||||
}
|
||||
|
||||
std::string token_bos = bos_token_override;
|
||||
std::string token_eos = eos_token_override;
|
||||
bool add_bos = false;
|
||||
@@ -606,6 +619,7 @@ const char * common_chat_format_name(common_chat_format format) {
|
||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
|
||||
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
|
||||
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
|
||||
case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
|
||||
case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
|
||||
default:
|
||||
throw std::runtime_error("Unknown chat format");
|
||||
@@ -618,11 +632,25 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
|
||||
case COMMON_REASONING_FORMAT_AUTO: return "auto";
|
||||
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
|
||||
case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
|
||||
case COMMON_REASONING_FORMAT_GRANITE: return "granite";
|
||||
default:
|
||||
throw std::runtime_error("Unknown reasoning format");
|
||||
}
|
||||
}
|
||||
|
||||
common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
|
||||
if (format == "none") {
|
||||
return COMMON_REASONING_FORMAT_NONE;
|
||||
} else if (format == "auto") {
|
||||
return COMMON_REASONING_FORMAT_AUTO;
|
||||
} else if (format == "deepseek") {
|
||||
return COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
} else if (format == "deepseek-legacy") {
|
||||
return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
|
||||
}
|
||||
throw std::runtime_error("Unknown reasoning format: " + format);
|
||||
}
|
||||
|
||||
static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
|
||||
std::string arguments;
|
||||
if (builder.is_partial()) {
|
||||
@@ -1312,16 +1340,164 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
data.prompt = prompt;
|
||||
data.format = COMMON_CHAT_FORMAT_GPT_OSS;
|
||||
|
||||
// TODO: support tool calls in GPT-OSS?
|
||||
// These special tokens are required to parse properly, so we include them
|
||||
// even if parse_tool_calls is false.
|
||||
data.preserved_tokens = {
|
||||
"<|channel|>",
|
||||
"<|constrain|>",
|
||||
"<|message|>",
|
||||
"<|start|>",
|
||||
"<|end|>",
|
||||
};
|
||||
|
||||
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
// tool calls can appear in commentary or analysis channels
|
||||
auto channel = builder.add_rule("channel", "\"<|channel|>\" ( \"commentary\" | \"analysis\" )");
|
||||
|
||||
std::vector<std::string> tool_rules_recipient_in_role;
|
||||
std::vector<std::string> tool_rules_recipient_in_channel;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
auto parameters = function.at("parameters");
|
||||
builder.resolve_refs(parameters);
|
||||
|
||||
tool_rules_recipient_in_role.push_back(
|
||||
builder.add_rule(name + "-call",
|
||||
"\"" + name + "\"" + channel + " \" <|constrain|>json\"? \"<|message|>\" " +
|
||||
builder.add_schema(name + "-args", parameters)
|
||||
)
|
||||
);
|
||||
|
||||
tool_rules_recipient_in_channel.push_back(
|
||||
builder.add_rule(name + "-call",
|
||||
"\"" + name + "\"" + " \" <|constrain|>json\"? \"<|message|>\" " +
|
||||
builder.add_schema(name + "-args", parameters)
|
||||
)
|
||||
);
|
||||
});
|
||||
|
||||
auto recipient_in_role = builder.add_rule("recipient_in_role",
|
||||
"\"<|start|>assistant\"? \" to=functions.\" ( " +
|
||||
string_join(tool_rules_recipient_in_role, " | ") + " )"
|
||||
);
|
||||
|
||||
auto recipient_in_channel = builder.add_rule("recipient_in_channel",
|
||||
channel + " \" to=functions.\" ( " +
|
||||
string_join(tool_rules_recipient_in_channel, " | ") + " )"
|
||||
);
|
||||
|
||||
builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
|
||||
|
||||
// Trigger on tool calls that appear in the commentary channel
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
|
||||
"<\\|channel\\|>(commentary|analysis) to"
|
||||
});
|
||||
|
||||
// Trigger tool calls that appear in the role section, either at the
|
||||
// start or in the middle.
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
|
||||
"^ to"
|
||||
});
|
||||
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
|
||||
"<\\|start\\|>assistant to"
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
|
||||
// TODO @ngxson : this won't work with --special enabled, we should fix that
|
||||
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
|
||||
if (!builder.syntax().parse_tool_calls) {
|
||||
builder.add_content(builder.consume_rest());
|
||||
return;
|
||||
static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
|
||||
static const std::string recipient("(?: to=functions\\.([^<\\s]+))");
|
||||
|
||||
static const common_regex start_regex("<\\|start\\|>assistant");
|
||||
static const common_regex analysis_regex("<\\|channel\\|>analysis");
|
||||
static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
|
||||
static const common_regex preamble_regex("<\\|channel\\|>commentary");
|
||||
static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
|
||||
static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");
|
||||
|
||||
auto consume_end = [&](bool include_end = false) {
|
||||
if (auto res = builder.try_find_literal("<|end|>")) {
|
||||
return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
|
||||
}
|
||||
return builder.consume_rest();
|
||||
};
|
||||
|
||||
auto handle_tool_call = [&](const std::string & name) {
|
||||
if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
|
||||
if (builder.syntax().parse_tool_calls) {
|
||||
if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
|
||||
throw common_chat_msg_partial_exception("incomplete tool call");
|
||||
}
|
||||
} else if (args->is_partial) {
|
||||
throw common_chat_msg_partial_exception("incomplete tool call");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
|
||||
auto match = regex.search(input, 0, true);
|
||||
if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
|
||||
return match;
|
||||
}
|
||||
return std::nullopt;
|
||||
};
|
||||
|
||||
do {
|
||||
auto header_start_pos = builder.pos();
|
||||
auto content_start = builder.try_find_literal("<|message|>");
|
||||
if (!content_start) {
|
||||
throw common_chat_msg_partial_exception("incomplete header");
|
||||
}
|
||||
|
||||
auto header = content_start->prelude;
|
||||
|
||||
if (auto match = regex_match(tool_call1_regex, header)) {
|
||||
auto group = match->groups[1];
|
||||
auto name = header.substr(group.begin, group.end - group.begin);
|
||||
handle_tool_call(name);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (auto match = regex_match(tool_call2_regex, header)) {
|
||||
auto group = match->groups[2];
|
||||
auto name = header.substr(group.begin, group.end - group.begin);
|
||||
handle_tool_call(name);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (regex_match(analysis_regex, header)) {
|
||||
builder.move_to(header_start_pos);
|
||||
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
|
||||
builder.add_content(consume_end(true));
|
||||
} else {
|
||||
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if(regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
|
||||
builder.add_content(consume_end());
|
||||
continue;
|
||||
}
|
||||
|
||||
// Possibly a malformed message, attempt to recover by rolling
|
||||
// back to pick up the next <|start|>
|
||||
LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
|
||||
builder.move_to(header_start_pos);
|
||||
} while (builder.try_find_regex(start_regex, std::string::npos, false));
|
||||
|
||||
auto remaining = builder.consume_rest();
|
||||
if (!remaining.empty()) {
|
||||
LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1734,6 +1910,124 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
|
||||
builder.add_content(builder.consume_rest());
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
// Pass thinking context for Granite template
|
||||
json additional_context = {
|
||||
{"thinking", inputs.enable_thinking},
|
||||
};
|
||||
|
||||
data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
|
||||
data.format = COMMON_CHAT_FORMAT_GRANITE;
|
||||
|
||||
if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
|
||||
if (!inputs.enable_thinking) {
|
||||
data.prompt += "</think>";
|
||||
} else {
|
||||
data.thinking_forced_open = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!inputs.tools.is_null()) {
|
||||
// Granite uses <|tool_call|> followed by JSON list
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
auto parameters = function.at("parameters");
|
||||
builder.resolve_refs(parameters);
|
||||
tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
|
||||
"-args", {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"name", {{"const", name}}},
|
||||
{"arguments", parameters},
|
||||
}},
|
||||
{"required", json::array({"name", "arguments"})},
|
||||
})));
|
||||
});
|
||||
|
||||
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
|
||||
auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");
|
||||
|
||||
if (data.thinking_forced_open) {
|
||||
builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
|
||||
} else {
|
||||
builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
|
||||
}
|
||||
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
|
||||
"<|tool_call|>"
|
||||
});
|
||||
|
||||
data.preserved_tokens = {
|
||||
"<think>",
|
||||
"</think>",
|
||||
"<response>",
|
||||
"</response>",
|
||||
"<|tool_call|>",
|
||||
};
|
||||
});
|
||||
} else {
|
||||
// Handle thinking tags for non-tool responses
|
||||
if (data.thinking_forced_open && inputs.enable_thinking) {
|
||||
data.grammar_lazy = false;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
|
||||
});
|
||||
data.preserved_tokens = {
|
||||
"<think>",
|
||||
"</think>",
|
||||
"<response>",
|
||||
"</response>",
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
static void common_chat_parse_granite(common_chat_msg_parser & builder) {
|
||||
// Parse thinking tags
|
||||
builder.try_parse_reasoning("<think>", "</think>");
|
||||
|
||||
// Parse response tags using regex
|
||||
static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
|
||||
if (auto res = builder.try_find_regex(response_regex)) {
|
||||
// Extract the content between the tags (capture group 1)
|
||||
auto content = builder.str(res->groups[1]);
|
||||
builder.add_content(content);
|
||||
builder.move_to(res->groups[0].end);
|
||||
}
|
||||
|
||||
if (!builder.syntax().parse_tool_calls) {
|
||||
builder.add_content(builder.consume_rest());
|
||||
return;
|
||||
}
|
||||
|
||||
// Look for tool calls
|
||||
static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
|
||||
if (auto res = builder.try_find_regex(tool_call_regex)) {
|
||||
builder.move_to(res->groups[0].end);
|
||||
|
||||
// Expect JSON array of tool calls
|
||||
auto tool_calls_data = builder.consume_json();
|
||||
if (tool_calls_data.json.is_array()) {
|
||||
if (!builder.add_tool_calls(tool_calls_data.json)) {
|
||||
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
|
||||
}
|
||||
} else {
|
||||
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
|
||||
}
|
||||
} else {
|
||||
builder.add_content(builder.consume_rest());
|
||||
}
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
common_chat_params data;
|
||||
data.prompt = apply(tmpl, inputs);
|
||||
@@ -1805,6 +2099,11 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||
return common_chat_params_init_command_r7b(tmpl, params);
|
||||
}
|
||||
|
||||
// Granite (IBM) - detects thinking / tools support
|
||||
if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
|
||||
return common_chat_params_init_granite(tmpl, params);
|
||||
}
|
||||
|
||||
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
|
||||
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
|
||||
return common_chat_params_init_hermes_2_pro(tmpl, params);
|
||||
@@ -1865,6 +2164,7 @@ static common_chat_params common_chat_templates_apply_legacy(
|
||||
int alloc_size = 0;
|
||||
std::vector<llama_chat_message> chat;
|
||||
std::vector<std::string> contents;
|
||||
|
||||
for (const auto & msg : inputs.messages) {
|
||||
auto content = msg.content;
|
||||
for (const auto & part : msg.content_parts) {
|
||||
@@ -1966,6 +2266,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
|
||||
case COMMON_CHAT_FORMAT_COMMAND_R7B:
|
||||
common_chat_parse_command_r7b(builder);
|
||||
break;
|
||||
case COMMON_CHAT_FORMAT_GRANITE:
|
||||
common_chat_parse_granite(builder);
|
||||
break;
|
||||
case COMMON_CHAT_FORMAT_GPT_OSS:
|
||||
common_chat_parse_gpt_oss(builder);
|
||||
break;
|
||||
|
||||
+4
-1
@@ -109,6 +109,7 @@ enum common_chat_format {
|
||||
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
|
||||
COMMON_CHAT_FORMAT_HERMES_2_PRO,
|
||||
COMMON_CHAT_FORMAT_COMMAND_R7B,
|
||||
COMMON_CHAT_FORMAT_GRANITE,
|
||||
COMMON_CHAT_FORMAT_GPT_OSS,
|
||||
|
||||
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
|
||||
@@ -186,10 +187,12 @@ std::string common_chat_format_single(
|
||||
// Returns an example of formatted chat
|
||||
std::string common_chat_format_example(
|
||||
const struct common_chat_templates * tmpls,
|
||||
bool use_jinja);
|
||||
bool use_jinja,
|
||||
const std::map<std::string, std::string> & chat_template_kwargs);
|
||||
|
||||
const char* common_chat_format_name(common_chat_format format);
|
||||
const char* common_reasoning_format_name(common_reasoning_format format);
|
||||
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
|
||||
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
|
||||
|
||||
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#endif
|
||||
#include <locale>
|
||||
#include <windows.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <io.h>
|
||||
#else
|
||||
@@ -1565,3 +1566,56 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
|
||||
ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
|
||||
const lr_opt & d = *(lr_opt *) userdata;
|
||||
result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
|
||||
result.sgd.wd = result.adamw.wd = d.wd;
|
||||
return result;
|
||||
}
|
||||
|
||||
// TODO make all command line args case-insensitive
|
||||
static inline bool eq_case_insensitive(char const* a, char const* b) {
|
||||
return !
|
||||
#if defined(_MSC_VER)
|
||||
_stricmp
|
||||
#else
|
||||
strcasecmp
|
||||
#endif // defined(_MSC_VER)
|
||||
(a, b);
|
||||
}
|
||||
|
||||
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
|
||||
if (eq_case_insensitive("adamw", n)) {
|
||||
return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
}
|
||||
if (eq_case_insensitive("sgd", n)) {
|
||||
return GGML_OPT_OPTIMIZER_TYPE_SGD;
|
||||
}
|
||||
return GGML_OPT_OPTIMIZER_TYPE_COUNT;
|
||||
}
|
||||
|
||||
// TODO simplify to use just log and exp
|
||||
static float const k_log_2 = std::log(2.f);
|
||||
|
||||
void lr_opt::init() {
|
||||
if (lr_min > 0 && lr_min < lr0) {
|
||||
float nhalf = std::log(lr0 / lr_min) / k_log_2;
|
||||
float e = epochs;
|
||||
if (decay_epochs > 0 && decay_epochs < e) {
|
||||
e = decay_epochs;
|
||||
} else {
|
||||
decay_epochs = e;
|
||||
}
|
||||
scale_epoch = nhalf / e;
|
||||
}
|
||||
}
|
||||
|
||||
float lr_opt::get_lr(float epoch) const {
|
||||
float r = lr_min <= 0 ? lr0 :
|
||||
epoch >= decay_epochs ? lr_min :
|
||||
lr0 * std::pow(0.5f, epoch * scale_epoch);
|
||||
LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
|
||||
return r;
|
||||
}
|
||||
|
||||
+41
-7
@@ -2,14 +2,17 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <cmath>
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DIRECTORY_SEPARATOR '\\'
|
||||
@@ -82,6 +85,7 @@ enum llama_example {
|
||||
LLAMA_EXAMPLE_PARALLEL,
|
||||
LLAMA_EXAMPLE_TTS,
|
||||
LLAMA_EXAMPLE_DIFFUSION,
|
||||
LLAMA_EXAMPLE_FINETUNE,
|
||||
|
||||
LLAMA_EXAMPLE_COUNT,
|
||||
};
|
||||
@@ -202,6 +206,7 @@ struct common_params_speculative {
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
||||
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
|
||||
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
||||
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
||||
@@ -239,8 +244,28 @@ enum common_reasoning_format {
|
||||
COMMON_REASONING_FORMAT_AUTO,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
|
||||
COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
|
||||
};
|
||||
|
||||
|
||||
struct lr_opt {
|
||||
float lr0 = 1e-5; // learning rate at first epoch
|
||||
float lr_min = -1;
|
||||
float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
|
||||
float scale_epoch = 0;
|
||||
float wd = 0;
|
||||
unsigned epochs = 2;
|
||||
|
||||
unsigned epoch; // set by optimizer outer (epochs) loop
|
||||
// learning rate decay - constant LR per epoch only for now
|
||||
float get_lr(float e) const;
|
||||
float get_lr() const { return get_lr(epoch); }
|
||||
// must call after arg parse, before get_lr
|
||||
void init();
|
||||
};
|
||||
|
||||
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
|
||||
|
||||
struct common_params {
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 4096; // context size
|
||||
@@ -375,6 +400,11 @@ struct common_params {
|
||||
bool no_mmproj = false; // explicitly disable multimodal model
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
|
||||
// finetune
|
||||
struct lr_opt lr;
|
||||
enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
float val_split = 0.05f; // fraction of the data used for the validation set
|
||||
|
||||
// embedding
|
||||
bool embedding = false; // get only sentence embedding
|
||||
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
||||
@@ -383,11 +413,12 @@ struct common_params {
|
||||
std::string cls_sep = "\t"; // separator of classification sequences
|
||||
|
||||
// server params
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
int32_t timeout_read = 600; // http read timeout in seconds
|
||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
int32_t timeout_read = 600; // http read timeout in seconds
|
||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = ""; // NOLINT
|
||||
@@ -702,3 +733,6 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||
//
|
||||
|
||||
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
|
||||
|
||||
// "adamw" or "sgd" (case insensitive)
|
||||
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
|
||||
|
||||
+378
-106
@@ -28,6 +28,14 @@ if TYPE_CHECKING:
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||
import gguf
|
||||
from gguf.vocab import MistralTokenizerType, MistralVocab
|
||||
from mistral_common.tokens.tokenizers.base import TokenizerVersion
|
||||
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD
|
||||
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
|
||||
from mistral_common.tokens.tokenizers.sentencepiece import (
|
||||
SentencePieceTokenizer,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger("hf-to-gguf")
|
||||
|
||||
@@ -81,6 +89,8 @@ class ModelBase:
|
||||
block_count: int
|
||||
tensor_map: gguf.TensorNameMap
|
||||
|
||||
is_mistral_format: bool = False
|
||||
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
|
||||
use_temp_file: bool = False, eager: bool = False,
|
||||
metadata_override: Path | None = None, model_name: str | None = None,
|
||||
@@ -106,16 +116,17 @@ class ModelBase:
|
||||
logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
|
||||
remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
|
||||
self.tensor_names = set(name for name in remote_tensors.keys())
|
||||
for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items():
|
||||
for name, remote_tensor in remote_tensors.items():
|
||||
yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor))
|
||||
|
||||
self.get_tensors = get_remote_tensors
|
||||
else:
|
||||
self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors")
|
||||
prefix = "model" if not self.is_mistral_format else "consolidated"
|
||||
self.part_names = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")
|
||||
self.is_safetensors = len(self.part_names) > 0
|
||||
if not self.is_safetensors:
|
||||
self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
|
||||
self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams
|
||||
self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
|
||||
self.tensor_names = None
|
||||
self.metadata_override = metadata_override
|
||||
self.model_name = model_name
|
||||
@@ -153,19 +164,23 @@ class ModelBase:
|
||||
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
||||
tensor_names_from_parts: set[str] = set()
|
||||
|
||||
index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
|
||||
index_name += ".index.json"
|
||||
index_file = self.dir_model / index_name
|
||||
if not self.is_mistral_format:
|
||||
index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
|
||||
index_name += ".index.json"
|
||||
index_file = self.dir_model / index_name
|
||||
|
||||
if index_file.is_file():
|
||||
self.tensor_names = set()
|
||||
logger.info(f"gguf: loading model weight map from '{index_name}'")
|
||||
with open(index_file, "r", encoding="utf-8") as f:
|
||||
index: dict[str, Any] = json.load(f)
|
||||
weight_map = index.get("weight_map")
|
||||
if weight_map is None or not isinstance(weight_map, dict):
|
||||
raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
|
||||
self.tensor_names.update(weight_map.keys())
|
||||
if index_file.is_file():
|
||||
self.tensor_names = set()
|
||||
logger.info(f"gguf: loading model weight map from '{index_name}'")
|
||||
with open(index_file, "r", encoding="utf-8") as f:
|
||||
index: dict[str, Any] = json.load(f)
|
||||
weight_map = index.get("weight_map")
|
||||
if weight_map is None or not isinstance(weight_map, dict):
|
||||
raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
|
||||
self.tensor_names.update(weight_map.keys())
|
||||
else:
|
||||
self.tensor_names = tensor_names_from_parts
|
||||
weight_map = {}
|
||||
else:
|
||||
self.tensor_names = tensor_names_from_parts
|
||||
weight_map = {}
|
||||
@@ -426,7 +441,12 @@ class ModelBase:
|
||||
return part_names
|
||||
|
||||
@staticmethod
|
||||
def load_hparams(dir_model: Path):
|
||||
def load_hparams(dir_model: Path, is_mistral_format: bool):
|
||||
if is_mistral_format:
|
||||
with open(dir_model / "params.json", "r", encoding="utf-8") as f:
|
||||
config = json.load(f)
|
||||
return config
|
||||
|
||||
try:
|
||||
# for security reason, we don't allow loading remote code by default
|
||||
# if a model need remote code, we will fallback to config.json
|
||||
@@ -476,7 +496,10 @@ class TextModel(ModelBase):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.hf_arch = get_model_architecture(self.hparams, self.model_type)
|
||||
if not self.is_mistral_format:
|
||||
self.hf_arch = get_model_architecture(self.hparams, self.model_type)
|
||||
else:
|
||||
self.hf_arch = ""
|
||||
|
||||
if "text_config" in self.hparams:
|
||||
# move the text_config to the root level
|
||||
@@ -542,14 +565,14 @@ class TextModel(ModelBase):
|
||||
self.gguf_writer.add_head_count(n_head)
|
||||
logger.info(f"gguf: head count = {n_head}")
|
||||
|
||||
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
|
||||
if (n_head_kv := self.find_hparam(["num_key_value_heads", "n_kv_heads"], optional=True)) is not None:
|
||||
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||
logger.info(f"gguf: key-value head count = {n_head_kv}")
|
||||
|
||||
if (rope_theta := self.hparams.get("rope_theta")) is not None:
|
||||
self.gguf_writer.add_rope_freq_base(rope_theta)
|
||||
logger.info(f"gguf: rope theta = {rope_theta}")
|
||||
if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
|
||||
if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
|
||||
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
|
||||
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
|
||||
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
|
||||
@@ -1210,12 +1233,19 @@ class MmprojModel(ModelBase):
|
||||
raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
|
||||
|
||||
# get n_embd of the text model
|
||||
if "text_config" not in self.hparams:
|
||||
self.hparams["text_config"] = {}
|
||||
if "audio_config" not in self.hparams:
|
||||
self.hparams["audio_config"] = {}
|
||||
text_config = {**self.hparams, **self.hparams["text_config"]}
|
||||
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
|
||||
if not self.is_mistral_format:
|
||||
if "text_config" not in self.hparams:
|
||||
self.hparams["text_config"] = {}
|
||||
if "audio_config" not in self.hparams:
|
||||
self.hparams["audio_config"] = {}
|
||||
text_config = {**self.hparams, **self.hparams["text_config"]}
|
||||
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
|
||||
else:
|
||||
text_config = {
|
||||
k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"]
|
||||
}
|
||||
self.n_embd_text = text_config.get("hidden_dim", 0)
|
||||
|
||||
assert self.n_embd_text > 0, "n_embd not found in hparams"
|
||||
|
||||
# move vision config to the top level, while preserving the original hparams in global_config
|
||||
@@ -1236,11 +1266,13 @@ class MmprojModel(ModelBase):
|
||||
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
|
||||
|
||||
# load preprocessor config
|
||||
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
|
||||
self.preprocessor_config = json.load(f)
|
||||
if not self.is_mistral_format:
|
||||
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
|
||||
self.preprocessor_config = json.load(f)
|
||||
|
||||
def get_vision_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config.get("vision_config")
|
||||
config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
|
||||
return self.global_config.get(config_name)
|
||||
|
||||
def get_audio_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config.get("audio_config")
|
||||
@@ -1264,8 +1296,11 @@ class MmprojModel(ModelBase):
|
||||
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
|
||||
|
||||
# preprocessor config
|
||||
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
|
||||
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
|
||||
image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
|
||||
image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
|
||||
|
||||
self.gguf_writer.add_vision_image_mean(image_mean)
|
||||
self.gguf_writer.add_vision_image_std(image_std)
|
||||
|
||||
if self.has_audio_encoder:
|
||||
self.gguf_writer.add_clip_has_audio_encoder(True)
|
||||
@@ -1924,11 +1959,63 @@ class LlamaModel(TextModel):
|
||||
if self.hf_arch == "VLlama3ForCausalLM":
|
||||
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
|
||||
|
||||
def _set_vocab_mistral(self):
|
||||
vocab = MistralVocab(self.dir_model)
|
||||
logger.info(
|
||||
f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
|
||||
)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
|
||||
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
||||
for text, score, toktype in vocab.all_tokens():
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
assert len(tokens) == vocab.vocab_size, (
|
||||
f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
|
||||
)
|
||||
|
||||
if vocab.tokenizer_type == MistralTokenizerType.tekken:
|
||||
self.gguf_writer.add_tokenizer_pre("tekken")
|
||||
self.gguf_writer.add_token_merges(
|
||||
vocab.extract_vocab_merges_from_model()
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
|
||||
)
|
||||
|
||||
self.gguf_writer.add_bos_token_id(vocab.bos_id)
|
||||
self.gguf_writer.add_eos_token_id(vocab.eos_id)
|
||||
self.gguf_writer.add_unk_token_id(vocab.unk_id)
|
||||
self.gguf_writer.add_pad_token_id(vocab.pad_id)
|
||||
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
self.gguf_writer.add_vocab_size(vocab.vocab_size)
|
||||
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(False)
|
||||
|
||||
template_dir = Path(__file__).parent / "models/templates/"
|
||||
|
||||
template = MistralModel.get_community_chat_template(vocab, template_dir)
|
||||
self.gguf_writer.add_chat_template(template)
|
||||
|
||||
def set_vocab(self):
|
||||
if self.is_mistral_format:
|
||||
return self._set_vocab_mistral()
|
||||
|
||||
path_tekken_json = self.dir_model / "tekken.json"
|
||||
path_tokenizer_json = self.dir_model / "tokenizer.json"
|
||||
if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
|
||||
return self.set_vocab_tekken()
|
||||
self._set_vocab_mistral()
|
||||
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
@@ -1962,56 +2049,12 @@ class LlamaModel(TextModel):
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
def set_vocab_tekken(self):
|
||||
vocab = gguf.vocab.MistralVocab(self.dir_model)
|
||||
self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
|
||||
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
||||
for text, score, toktype in vocab.all_tokens():
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
assert len(tokens) == vocab.vocab_size, (
|
||||
f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
|
||||
)
|
||||
|
||||
if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
|
||||
self.gguf_writer.add_tokenizer_pre("tekken")
|
||||
self.gguf_writer.add_token_merges(
|
||||
vocab.extract_vocab_merges_from_model()
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
|
||||
)
|
||||
|
||||
self.gguf_writer.add_bos_token_id(vocab.bos_id)
|
||||
self.gguf_writer.add_eos_token_id(vocab.eos_id)
|
||||
self.gguf_writer.add_unk_token_id(vocab.unk_id)
|
||||
self.gguf_writer.add_pad_token_id(vocab.pad_id)
|
||||
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
self.gguf_writer.add_vocab_size(vocab.vocab_size)
|
||||
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(False)
|
||||
|
||||
script_dir = Path(__file__).parent
|
||||
template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
|
||||
with open(template_path, "r", encoding="utf-8") as f:
|
||||
template = f.read()
|
||||
self.gguf_writer.add_chat_template(template)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
|
||||
if not self.is_mistral_format:
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
|
||||
if (rope_dim := hparams.get("head_dim")) is None:
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
@@ -2033,13 +2076,25 @@ class LlamaModel(TextModel):
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
n_head = self.find_hparam(["n_heads", "num_attention_heads"])
|
||||
n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])
|
||||
|
||||
vision_prefixes = [
|
||||
"vision_encoder.",
|
||||
"vision_language_adapter.",
|
||||
"patch_merger.",
|
||||
"pre_mm_projector_norm",
|
||||
]
|
||||
|
||||
is_multimodal_tensor = "vision_tower" in name \
|
||||
or "vision_model" in name \
|
||||
or "audio_tower" in name \
|
||||
or "model.connector" in name \
|
||||
or "multi_modal_projector" in name
|
||||
or "multi_modal_projector" in name \
|
||||
or any(
|
||||
name.startswith(prefix)
|
||||
for prefix in vision_prefixes
|
||||
)
|
||||
|
||||
if is_multimodal_tensor:
|
||||
return [] # skip vision tensors
|
||||
@@ -2155,13 +2210,18 @@ class LlavaVisionModel(MmprojModel):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
if self.hparams["model_type"] == "pixtral":
|
||||
if self.hparams.get("model_type") == "pixtral":
|
||||
# layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
|
||||
self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
|
||||
self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
|
||||
logger.info(f"Image break token id: {self.img_break_tok_id}")
|
||||
elif self.is_mistral_format:
|
||||
# hparams is already vision config here so norm_eps is only defined in global_config.
|
||||
self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
|
||||
assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
|
||||
self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
|
||||
else:
|
||||
raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
|
||||
logger.info(f"Image break token id: {self.img_break_tok_id}")
|
||||
|
||||
def get_token_id(self, token: str) -> int:
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
@@ -2175,7 +2235,7 @@ class LlavaVisionModel(MmprojModel):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
if hparams["model_type"] == "pixtral":
|
||||
if hparams.get("model_type") == "pixtral":
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
|
||||
|
||||
@@ -2193,18 +2253,30 @@ class LlavaVisionModel(MmprojModel):
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_head = (
|
||||
self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"])
|
||||
)
|
||||
n_kv_head = n_head
|
||||
|
||||
if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."):
|
||||
valid_prefixes = (
|
||||
"multi_modal_projector.",
|
||||
"vision_tower.",
|
||||
"vision_encoder.",
|
||||
"vision_language_adapter.",
|
||||
"patch_merger.",
|
||||
"pre_mm_projector_norm",
|
||||
)
|
||||
|
||||
if any(name.startswith(prefix) for prefix in valid_prefixes):
|
||||
# process vision tensors
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")) and not self.is_mistral_format:
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format:
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
if self.img_break_tok_id > 0 and "embed_tokens.weight" in name:
|
||||
embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight"
|
||||
if self.img_break_tok_id > 0 and embed_key in name:
|
||||
logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
|
||||
# for pixtral model, we need to extract the [IMG_BREAK] token embedding
|
||||
img_break_embd = data_torch[self.img_break_tok_id]
|
||||
@@ -3328,7 +3400,13 @@ class Qwen25OmniModel(Qwen2VLVisionModel):
|
||||
@ModelBase.register("InternVisionModel")
|
||||
class InternVisionModel(MmprojModel):
|
||||
def set_gguf_parameters(self):
|
||||
assert self.hparams_vision is not None
|
||||
if isinstance(self.hparams_vision['image_size'], list):
|
||||
self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0]
|
||||
if isinstance(self.hparams_vision['patch_size'], list):
|
||||
self.hparams_vision['patch_size'] = self.hparams_vision['patch_size'][0]
|
||||
super().set_gguf_parameters()
|
||||
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
|
||||
@@ -3352,14 +3430,30 @@ class InternVisionModel(MmprojModel):
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return False
|
||||
|
||||
def _mapping_interns1_name(self, name):
|
||||
names_map = {
|
||||
"model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias",
|
||||
"model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
|
||||
"model.multi_modal_projector.linear_1.bias": "mlp1.1.bias",
|
||||
"model.multi_modal_projector.linear_1.weight": "mlp1.1.weight",
|
||||
"model.multi_modal_projector.linear_2.bias": "mlp1.3.bias",
|
||||
"model.multi_modal_projector.linear_2.weight": "mlp1.3.weight",
|
||||
}
|
||||
if name in names_map:
|
||||
name = names_map[name]
|
||||
return name
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
if name.startswith("vision_model") or name.startswith("mlp"):
|
||||
vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector']
|
||||
# deal with intern-s1 special case
|
||||
name = self._mapping_interns1_name(name)
|
||||
if any([name.startswith(prefix) for prefix in vision_prefix]):
|
||||
# process visual tensors
|
||||
# correct name
|
||||
if name.startswith("vision_model"):
|
||||
name = "vision_tower." + name
|
||||
if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
|
||||
if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"):
|
||||
name += ".weight"
|
||||
# split QKV tensors if needed
|
||||
if ".qkv." in name:
|
||||
@@ -3445,6 +3539,10 @@ class Qwen2MoeModel(TextModel):
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# process the experts separately
|
||||
name = name.replace("language_model.", "") # InternVL
|
||||
if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
|
||||
# skip visual tensors
|
||||
return []
|
||||
if name.find("experts") != -1:
|
||||
n_experts = self.hparams["num_experts"]
|
||||
assert bid is not None
|
||||
@@ -3498,6 +3596,85 @@ class Qwen3Model(Qwen2Model):
|
||||
class Qwen3MoeModel(Qwen2MoeModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN3MOE
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
hparams = ModelBase.load_hparams(self.dir_model, False)
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
def set_vocab(self):
|
||||
# deal with intern-s1
|
||||
if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
|
||||
self._set_vocab_interns1()
|
||||
return
|
||||
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def _set_vocab_interns1(self):
|
||||
tokens: list[str] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
|
||||
vocab_size = self.hparams.get("vocab_size", len(vocab))
|
||||
assert max(vocab.values()) < vocab_size
|
||||
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
else:
|
||||
token: str = reverse_vocab[i]
|
||||
if token in added_vocab:
|
||||
# The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
|
||||
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
|
||||
if not added_tokens_decoder[i].normalized:
|
||||
previous_token = token
|
||||
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
|
||||
if previous_token != token:
|
||||
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
|
||||
|
||||
if added_tokens_decoder[i].special or self.does_token_look_special(token):
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
tokens.append(token)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
|
||||
additional_special_tokens = []
|
||||
if special_tokens_map_file.is_file():
|
||||
with open(special_tokens_map_file, encoding = 'utf-8') as f:
|
||||
additional_special_tokens = json.load(f).get('additional_special_tokens', [])
|
||||
tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
|
||||
if tokenizer_cfg_file.is_file():
|
||||
with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
|
||||
added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
|
||||
token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
|
||||
for token in additional_special_tokens:
|
||||
if token in token2ids_map:
|
||||
special_vocab._set_special_token(token, token2ids_map[token])
|
||||
special_vocab._set_special_token('eos', 151645)
|
||||
special_vocab._set_special_token("bos", 151643)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
|
||||
@ModelBase.register("GPT2LMHeadModel")
|
||||
class GPT2Model(TextModel):
|
||||
@@ -4578,7 +4755,7 @@ class NomicBertModel(BertModel):
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
|
||||
hparams = kwargs.pop("hparams", None)
|
||||
if hparams is None:
|
||||
hparams = ModelBase.load_hparams(dir_model)
|
||||
hparams = ModelBase.load_hparams(dir_model, False)
|
||||
|
||||
self.is_moe = bool(hparams.get("moe_every_n_layers"))
|
||||
self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT
|
||||
@@ -7997,7 +8174,6 @@ class GptOssModel(TextModel):
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
blocks0: Tensor = torch.zeros(1)
|
||||
blocks1: Tensor = torch.zeros(1)
|
||||
found_mxfp4_tensors = False
|
||||
# we assume that tensors are loaded in the correct order
|
||||
for name, data_torch in self.get_tensors():
|
||||
if "mlp.experts.down_proj_blocks" in name:
|
||||
@@ -8005,7 +8181,6 @@ class GptOssModel(TextModel):
|
||||
elif "mlp.experts.down_proj_scales" in name:
|
||||
new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
|
||||
self.repack_mxfp4(new_name, blocks0, data_torch)
|
||||
found_mxfp4_tensors = True
|
||||
elif "mlp.experts.gate_up_proj_blocks" in name:
|
||||
blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
|
||||
elif "mlp.experts.gate_up_proj_scales" in name:
|
||||
@@ -8014,9 +8189,6 @@ class GptOssModel(TextModel):
|
||||
new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
|
||||
self.repack_mxfp4(new_name_gate, blocks0, scales0)
|
||||
self.repack_mxfp4(new_name_up, blocks1, scales1)
|
||||
found_mxfp4_tensors = True
|
||||
if not found_mxfp4_tensors:
|
||||
raise ValueError("No MXFP4 tensors found in the model. Please make sure you are using MXFP4 model.")
|
||||
return []
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
@@ -8029,7 +8201,12 @@ class GptOssModel(TextModel):
|
||||
if "down_proj" in name:
|
||||
if name.endswith("_bias"):
|
||||
name = name.replace("down_proj_bias", "down_proj.bias")
|
||||
elif "_blocks" not in name and "_scales" not in name:
|
||||
logger.warning(f"{name} is not in MXFP4, performance may be degraded")
|
||||
name = name.replace("down_proj", "down_proj.weight")
|
||||
data_torch = data_torch.transpose(-1, -2)
|
||||
else:
|
||||
# otherwise, it should already be repacked to ggml MXFP4 format
|
||||
return []
|
||||
|
||||
# split the gate_up into gate and up
|
||||
@@ -8042,7 +8219,18 @@ class GptOssModel(TextModel):
|
||||
(self.map_tensor_name(name_gate), gate_proj_bias),
|
||||
(self.map_tensor_name(name_up), up_proj_bias)
|
||||
]
|
||||
elif "_blocks" not in name and "_scales" not in name:
|
||||
logger.warning(f"{name} is not in MXFP4, performance may be degraded")
|
||||
name_up = name.replace("gate_up_proj", "up_proj.weight")
|
||||
name_gate = name.replace("gate_up_proj", "gate_proj.weight")
|
||||
data_torch = data_torch.transpose(-1, -2)
|
||||
gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :]
|
||||
return [
|
||||
(self.map_tensor_name(name_gate), gate_proj_weight),
|
||||
(self.map_tensor_name(name_up), up_proj_weight)
|
||||
]
|
||||
else:
|
||||
# otherwise, it should already be repacked to ggml MXFP4 format
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
@@ -8188,6 +8376,77 @@ class SmallThinkerModel(TextModel):
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
class MistralModel(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
model_name = "Mistral"
|
||||
hf_arch = ""
|
||||
is_mistral_format = True
|
||||
undo_permute = False
|
||||
|
||||
@staticmethod
|
||||
def get_community_chat_template(vocab: MistralVocab, templates_dir: Path):
|
||||
assert TokenizerVersion is not None, "mistral_common is not installed"
|
||||
assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
|
||||
f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
|
||||
)
|
||||
|
||||
if vocab.tokenizer.version == TokenizerVersion.v1:
|
||||
return "mistral-v1"
|
||||
elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.spm:
|
||||
return "mistral-v3"
|
||||
elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.tekken:
|
||||
return "mistral-v3-tekken"
|
||||
elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.spm:
|
||||
return "mistral-v7"
|
||||
elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.tekken:
|
||||
return "mistral-v7-tekken"
|
||||
elif vocab.tokenizer.version == TokenizerVersion.v11:
|
||||
template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja"
|
||||
elif vocab.tokenizer.version == TokenizerVersion.v13:
|
||||
template_file = "unsloth-mistral-Devstral-Small-2507.jinja"
|
||||
else:
|
||||
raise ValueError(f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}")
|
||||
|
||||
template_path = templates_dir / template_file
|
||||
if not template_path.exists():
|
||||
raise FileNotFoundError(f"Template file not found: {template_path}")
|
||||
|
||||
with open(template_path, "r", encoding="utf-8") as f:
|
||||
template = f.read()
|
||||
|
||||
return template
|
||||
|
||||
|
||||
class PixtralModel(LlavaVisionModel):
|
||||
model_name = "Pixtral"
|
||||
hf_arch = ""
|
||||
is_mistral_format = True
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
|
||||
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(
|
||||
self.find_hparam(["norm_eps"])
|
||||
)
|
||||
self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"]))
|
||||
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
|
||||
# spatial_merge_size
|
||||
if self.find_vparam(["mm_projector_id"]) == "patch_merge":
|
||||
self.gguf_writer.add_vision_spatial_merge_size(
|
||||
self.find_vparam(["spatial_merge_size"])
|
||||
)
|
||||
|
||||
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
|
||||
if name == "vision_language_adapter.w_in.weight":
|
||||
return "mm.1.weight"
|
||||
elif name == "vision_language_adapter.w_out.weight":
|
||||
return "mm.2.weight"
|
||||
return super().map_tensor_name(name, try_suffixes)
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
@@ -8338,6 +8597,10 @@ def parse_args() -> argparse.Namespace:
|
||||
"--mmproj", action="store_true",
|
||||
help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mistral-format", action="store_true",
|
||||
help="Whether the model is stored following the Mistral format.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
if not args.print_supported_models and args.model is None:
|
||||
@@ -8443,17 +8706,25 @@ def main() -> None:
|
||||
if "mmproj" not in fname_out.name:
|
||||
fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
|
||||
|
||||
is_mistral_format = args.mistral_format
|
||||
|
||||
with torch.inference_mode():
|
||||
output_type = ftype_map[args.outtype]
|
||||
model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
|
||||
hparams = ModelBase.load_hparams(dir_model)
|
||||
model_architecture = get_model_architecture(hparams, model_type)
|
||||
logger.info(f"Model architecture: {model_architecture}")
|
||||
try:
|
||||
model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
|
||||
except NotImplementedError:
|
||||
logger.error(f"Model {model_architecture} is not supported")
|
||||
sys.exit(1)
|
||||
hparams = ModelBase.load_hparams(dir_model, is_mistral_format)
|
||||
if not is_mistral_format:
|
||||
model_architecture = get_model_architecture(hparams, model_type)
|
||||
logger.info(f"Model architecture: {model_architecture}")
|
||||
try:
|
||||
model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
|
||||
except NotImplementedError:
|
||||
logger.error(f"Model {model_architecture} is not supported")
|
||||
sys.exit(1)
|
||||
elif args.mmproj:
|
||||
assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
|
||||
model_class = PixtralModel
|
||||
else:
|
||||
model_class = MistralModel
|
||||
|
||||
model_instance = model_class(dir_model, output_type, fname_out,
|
||||
is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
|
||||
@@ -8462,7 +8733,8 @@ def main() -> None:
|
||||
split_max_tensors=args.split_max_tensors,
|
||||
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
|
||||
small_first_shard=args.no_tensor_first_split,
|
||||
remote_hf_model_id=hf_repo_id)
|
||||
remote_hf_model_id=hf_repo_id,
|
||||
)
|
||||
|
||||
if args.vocab_only:
|
||||
logger.info("Exporting model vocab...")
|
||||
|
||||
@@ -340,7 +340,7 @@ if __name__ == '__main__':
|
||||
sys.exit(1)
|
||||
else:
|
||||
logger.info(f"Loading base model: {dir_base_model.name}")
|
||||
hparams = ModelBase.load_hparams(dir_base_model)
|
||||
hparams = ModelBase.load_hparams(dir_base_model, False)
|
||||
|
||||
with torch.inference_mode():
|
||||
try:
|
||||
|
||||
+29
-11
@@ -76,6 +76,23 @@ cmake --build build --config Release -j $(nproc)
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
## IBM zDNN Accelerator
|
||||
|
||||
This provides acceleration using the IBM zAIU co-processor located in the Telum I and Telum II processors. Make sure to have the [IBM zDNN library](https://github.com/IBM/zDNN) installed.
|
||||
|
||||
#### Compile from source from IBM
|
||||
|
||||
You may find the official build instructions here: [Building and Installing zDNN](https://github.com/IBM/zDNN?tab=readme-ov-file#building-and-installing-zdnn)
|
||||
|
||||
### Compilation
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_ZDNN=ON
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
```
|
||||
|
||||
## Getting GGUF Models
|
||||
|
||||
All models need to be converted to Big-Endian. You can achieve this in three cases:
|
||||
@@ -145,15 +162,15 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||
### 1. SIMD Acceleration
|
||||
|
||||
Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.
|
||||
Only available in IBM z15/LinuxONE 3 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.
|
||||
|
||||
### 2. NNPA Vector Intrinsics Acceleration
|
||||
|
||||
Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
|
||||
Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
|
||||
|
||||
### 3. zDNN Accelerator
|
||||
### 3. zDNN Accelerator (WIP)
|
||||
|
||||
_Only available in IBM z16 / LinuxONE 4 or later system. No support currently available._
|
||||
Only available in IBM z17/LinuxONE 5 or later system with the `-DGGML_ZDNN=ON` compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs will default back to CPU routines.
|
||||
|
||||
### 4. Spyre Accelerator
|
||||
|
||||
@@ -229,11 +246,12 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||
|
||||
## Appendix A: Hardware Support Matrix
|
||||
|
||||
| | Support | Minimum Compiler Version |
|
||||
| ------- | ------- | ------------------------ |
|
||||
| IBM z15 | ✅ | |
|
||||
| IBM z16 | ✅ | |
|
||||
| IBM z17 | ✅ | GCC 15.1.0 |
|
||||
| | Support | Minimum Compiler Version |
|
||||
| -------- | ------- | ------------------------ |
|
||||
| IBM z15 | ✅ | |
|
||||
| IBM z16 | ✅ | |
|
||||
| IBM z17 | ✅ | GCC 15.1.0 |
|
||||
| IBM zDNN | ✅ | |
|
||||
|
||||
- ✅ - supported and verified to run as intended
|
||||
- 🚫 - unsupported, we are unlikely able to provide support
|
||||
@@ -242,7 +260,7 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||
|
||||
| | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
|
||||
| ---------- | ----------- | ---- | ---- | ----- |
|
||||
| FP32 | ✅ | ✅ | ❓ | ❓ |
|
||||
| FP32 | ✅ | ✅ | ✅ | ❓ |
|
||||
| FP16 | ✅ | ✅ | ❓ | ❓ |
|
||||
| BF16 | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| Q4_0 | ✅ | ✅ | ❓ | ❓ |
|
||||
@@ -273,4 +291,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||
- 🚫 - acceleration unavailable, will still run using scalar implementation
|
||||
- ❓ - acceleration unknown, please contribute if you can test it yourself
|
||||
|
||||
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 25, 2025.
|
||||
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 31, 2025.
|
||||
|
||||
@@ -13,7 +13,7 @@ If there are differences in usage, please refer to the official build [documenta
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ If there are differences in usage, please refer to the official build [documenta
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
|
||||
+89
-88
@@ -12,91 +12,92 @@ Legend:
|
||||
- 🟡 Partially supported by this backend
|
||||
- ❌ Not supported by this backend
|
||||
|
||||
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan |
|
||||
|-----------|------|------|------|------|------|------|------|------|
|
||||
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ |
|
||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 |
|
||||
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 |
|
||||
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 |
|
||||
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ |
|
||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
|
||||
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ |
|
||||
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
|
||||
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
|
||||
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ |
|
||||
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
||||
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ |
|
||||
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ |
|
||||
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
||||
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ |
|
||||
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
||||
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ |
|
||||
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
|
||||
|-----------|------|------|------|------|------|------|------|------|------|
|
||||
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
|
||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
|
||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
|
||||
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
|
||||
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
|
||||
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
|
||||
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
|
||||
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
|
||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
|
||||
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
|
||||
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
|
||||
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
|
||||
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
|
||||
| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ |
|
||||
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
|
||||
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ | ❌ |
|
||||
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
|
||||
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
|
||||
|
||||
+8134
File diff suppressed because it is too large
Load Diff
@@ -7,6 +7,7 @@
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
|
||||
/**
|
||||
* This the arbitrary data which will be passed to each callback.
|
||||
@@ -77,6 +78,12 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
|
||||
LOG(" ]\n");
|
||||
LOG(" sum = %f\n", sum);
|
||||
}
|
||||
|
||||
// TODO: make this abort configurable/optional?
|
||||
if (std::isnan(sum)) {
|
||||
LOG_ERR("encountered NaN - aborting\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -59,6 +59,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
||||
params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
|
||||
|
||||
common_init_result llama_init_dft = common_init_from_params(params);
|
||||
|
||||
//model_dft = llama_init_dft.model.get();
|
||||
|
||||
@@ -85,6 +85,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
||||
params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
|
||||
|
||||
common_init_result llama_init_dft = common_init_from_params(params);
|
||||
|
||||
model_dft = llama_init_dft.model.get();
|
||||
|
||||
@@ -10,20 +10,20 @@
|
||||
#include <vector>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.escape = false;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.use_mmap) {
|
||||
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
|
||||
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n",
|
||||
__func__);
|
||||
params.use_mmap = false;
|
||||
}
|
||||
if (params.cache_type_k != GGML_TYPE_F32) {
|
||||
@@ -38,11 +38,10 @@ int main(int argc, char ** argv) {
|
||||
common_init();
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
common_init_result llama_init = common_init_from_params(params);
|
||||
llama_model_ptr & model = llama_init.model;
|
||||
llama_context_ptr & ctx = llama_init.context;
|
||||
common_init_result llama_init = common_init_from_params(params);
|
||||
llama_model_ptr & model = llama_init.model;
|
||||
llama_context_ptr & ctx = llama_init.context;
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
@@ -55,31 +54,32 @@ int main(int argc, char ** argv) {
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
constexpr float val_split = 0.05f;
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get()) / 2);
|
||||
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
|
||||
struct lr_opt & lr = params.lr;
|
||||
LOG_INF("-optimizer %s -lr0 %.2g -wd %.2g -lr-min %.2g -min-epochs %.2g -epochs %d -period %.2g -val %.2g\n",
|
||||
ggml_opt_optimizer_name(params.optimizer), (double) lr.lr0, (double) lr.wd, (double) lr.lr_min, (double) lr.decay_epochs,
|
||||
(unsigned) lr.epochs, (double) params.n_batch / params.n_ubatch, (double) params.val_split);
|
||||
|
||||
struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
|
||||
optimizer_params.adamw.alpha = 1e-7f; // learning rate
|
||||
|
||||
struct llama_opt_params lopt_params {
|
||||
/*n_ctx_train =*/ 0,
|
||||
/*param_filter =*/ llama_opt_param_filter_all,
|
||||
/*param_filter_ud =*/ nullptr,
|
||||
/*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params,
|
||||
/*get_opt_pars_ud =*/ &optimizer_params,
|
||||
struct llama_opt_params lopt_params{
|
||||
/*n_ctx_train =*/0,
|
||||
/*param_filter =*/llama_opt_param_filter_all,
|
||||
/*param_filter_ud =*/nullptr,
|
||||
/*get_opt_pars =*/common_opt_lr_pars,
|
||||
/*get_opt_pars_ud =*/¶ms.lr,
|
||||
/*optimizer_type =*/params.optimizer,
|
||||
};
|
||||
llama_opt_init(ctx.get(), model.get(), lopt_params);
|
||||
|
||||
const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);
|
||||
const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);
|
||||
|
||||
ggml_opt_result_t result_train = ggml_opt_result_init();
|
||||
ggml_opt_result_t result_eval = ggml_opt_result_init();
|
||||
|
||||
for (int epoch = 0; epoch < 2; ++epoch) {
|
||||
for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) {
|
||||
llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
|
||||
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
|
||||
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
ggml_opt_result_reset(result_train);
|
||||
@@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
|
||||
ggml_opt_result_free(result_train);
|
||||
ggml_opt_result_free(result_eval);
|
||||
|
||||
llama_model_save_to_file(model.get(), "finetuned-model.gguf");
|
||||
llama_model_save_to_file(model.get(), params.out_file.c_str());
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
|
||||
@@ -36,9 +36,6 @@
|
||||
# ```
|
||||
# nixConfig = {
|
||||
# extra-substituters = [
|
||||
# # Populated by the CI in ggml-org/llama.cpp
|
||||
# "https://llama-cpp.cachix.org"
|
||||
#
|
||||
# # A development cache for nixpkgs imported with `config.cudaSupport = true`.
|
||||
# # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
|
||||
# # This lets one skip building e.g. the CUDA-enabled openmpi.
|
||||
@@ -47,10 +44,8 @@
|
||||
# ];
|
||||
#
|
||||
# # Verify these are the same keys as published on
|
||||
# # - https://app.cachix.org/cache/llama-cpp
|
||||
# # - https://app.cachix.org/cache/cuda-maintainers
|
||||
# extra-trusted-public-keys = [
|
||||
# "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
|
||||
# "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
||||
# ];
|
||||
# };
|
||||
|
||||
@@ -176,6 +176,7 @@ option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM"
|
||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
|
||||
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
|
||||
option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
|
||||
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
|
||||
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
|
||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||
@@ -187,6 +188,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
|
||||
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
||||
option(GGML_WEBGPU "ggml: use WebGPU" OFF)
|
||||
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
|
||||
option(GGML_ZDNN "ggml: use zDNN" OFF)
|
||||
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
||||
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
|
||||
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
||||
|
||||
@@ -106,7 +106,7 @@ if(NOT TARGET ggml::ggml)
|
||||
|
||||
find_library(GGML_LIBRARY ggml
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR} ${GGML_BACKEND_DIR}
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
add_library(ggml::ggml UNKNOWN IMPORTED)
|
||||
@@ -125,54 +125,56 @@ if(NOT TARGET ggml::ggml)
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
||||
|
||||
set(_ggml_all_targets "")
|
||||
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
|
||||
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
|
||||
string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
|
||||
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
|
||||
string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
|
||||
|
||||
find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
|
||||
message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
|
||||
|
||||
add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES c_std_90
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
|
||||
if(is_cpu_variant)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
if(GGML_CPU_INTERFACE_LINK_OPTIONS)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
|
||||
endif()
|
||||
|
||||
else()
|
||||
list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
|
||||
add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES c_std_90
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
|
||||
string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
|
||||
if(is_cpu_variant)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
if(GGML_CPU_INTERFACE_LINK_OPTIONS)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
|
||||
endif()
|
||||
|
||||
else()
|
||||
list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
|
||||
endif()
|
||||
endif()
|
||||
INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
list(APPEND _ggml_all_targets ggml::${_ggml_backend})
|
||||
endforeach()
|
||||
if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
list(APPEND _ggml_all_targets ggml::${_ggml_backend})
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
|
||||
set_target_properties(ggml::ggml
|
||||
|
||||
+25
-6
@@ -74,16 +74,26 @@ extern "C" {
|
||||
GGML_OPT_BUILD_TYPE_OPT = 30,
|
||||
};
|
||||
|
||||
enum ggml_opt_optimizer_type {
|
||||
GGML_OPT_OPTIMIZER_TYPE_ADAMW,
|
||||
GGML_OPT_OPTIMIZER_TYPE_SGD,
|
||||
|
||||
GGML_OPT_OPTIMIZER_TYPE_COUNT
|
||||
};
|
||||
|
||||
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
|
||||
struct ggml_opt_optimizer_params {
|
||||
// AdamW optimizer parameters
|
||||
struct {
|
||||
float alpha; // learning rate
|
||||
float beta1;
|
||||
float beta2;
|
||||
float beta1; // first AdamW momentum
|
||||
float beta2; // second AdamW momentum
|
||||
float eps; // epsilon for numerical stability
|
||||
float wd; // weight decay for AdamW, use 0.0f to disable
|
||||
float wd; // weight decay - 0.0f to disable
|
||||
} adamw;
|
||||
struct {
|
||||
float alpha; // learning rate
|
||||
float wd; // weight decay
|
||||
} sgd;
|
||||
};
|
||||
|
||||
// callback to calculate optimizer parameters prior to a backward pass
|
||||
@@ -112,8 +122,11 @@ extern "C" {
|
||||
|
||||
int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
|
||||
|
||||
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
||||
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
||||
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
||||
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
||||
|
||||
// only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
|
||||
enum ggml_opt_optimizer_type optimizer;
|
||||
};
|
||||
|
||||
// get parameters for an optimization context with defaults set where possible
|
||||
@@ -142,6 +155,10 @@ extern "C" {
|
||||
// get the gradient accumulator for a node from the forward graph
|
||||
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
|
||||
|
||||
GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme
|
||||
|
||||
GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
|
||||
|
||||
// ====== Optimization Result ======
|
||||
|
||||
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
|
||||
@@ -226,12 +243,14 @@ extern "C" {
|
||||
struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
|
||||
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
|
||||
enum ggml_opt_loss_type loss_type, // loss to minimize
|
||||
enum ggml_opt_optimizer_type optimizer, // sgd or adamw
|
||||
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
|
||||
int64_t nepoch, // how many times the dataset should be iterated over
|
||||
int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
|
||||
float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
|
||||
bool silent); // whether or not info prints to stderr should be suppressed
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
|
||||
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
+28
-2
@@ -241,6 +241,8 @@
|
||||
#define GGML_ROPE_TYPE_MROPE 8
|
||||
#define GGML_ROPE_TYPE_VISION 24
|
||||
|
||||
#define GGML_MROPE_SECTIONS 4
|
||||
|
||||
#define GGML_UNUSED(x) (void)(x)
|
||||
|
||||
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
|
||||
@@ -540,6 +542,7 @@ extern "C" {
|
||||
GGML_OP_CROSS_ENTROPY_LOSS,
|
||||
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
|
||||
GGML_OP_OPT_STEP_ADAMW,
|
||||
GGML_OP_OPT_STEP_SGD,
|
||||
|
||||
GGML_OP_GLU,
|
||||
|
||||
@@ -1660,7 +1663,7 @@ extern "C" {
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * c,
|
||||
int n_dims,
|
||||
int sections[4],
|
||||
int sections[GGML_MROPE_SECTIONS],
|
||||
int mode,
|
||||
int n_ctx_orig,
|
||||
float freq_base,
|
||||
@@ -1686,6 +1689,22 @@ extern "C" {
|
||||
float beta_fast,
|
||||
float beta_slow);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * c,
|
||||
int n_dims,
|
||||
int sections[GGML_MROPE_SECTIONS],
|
||||
int mode,
|
||||
int n_ctx_orig,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
float ext_factor,
|
||||
float attn_factor,
|
||||
float beta_fast,
|
||||
float beta_slow);
|
||||
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -2293,7 +2312,14 @@ extern "C" {
|
||||
struct ggml_tensor * grad,
|
||||
struct ggml_tensor * m,
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * adamw_params); // parameters such a the learning rate
|
||||
struct ggml_tensor * adamw_params); // parameters such as the learning rate
|
||||
|
||||
// stochastic gradient descent step (with weight decay)
|
||||
GGML_API struct ggml_tensor * ggml_opt_step_sgd(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * grad,
|
||||
struct ggml_tensor * sgd_params); // alpha, weight decay
|
||||
|
||||
//
|
||||
// automatic differentiation
|
||||
|
||||
@@ -382,6 +382,7 @@ ggml_add_backend(RPC)
|
||||
ggml_add_backend(SYCL)
|
||||
ggml_add_backend(Vulkan)
|
||||
ggml_add_backend(WebGPU)
|
||||
ggml_add_backend(zDNN)
|
||||
ggml_add_backend(OpenCL)
|
||||
|
||||
foreach (target ggml-base ggml)
|
||||
|
||||
@@ -49,6 +49,10 @@
|
||||
#include "ggml-webgpu.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_ZDNN
|
||||
#include "ggml-zdnn.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_OPENCL
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
@@ -180,6 +184,9 @@ struct ggml_backend_registry {
|
||||
#ifdef GGML_USE_WEBGPU
|
||||
register_backend(ggml_backend_webgpu_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_ZDNN
|
||||
register_backend(ggml_backend_zdnn_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_OPENCL
|
||||
register_backend(ggml_backend_opencl_reg());
|
||||
#endif
|
||||
|
||||
@@ -1071,6 +1071,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
}
|
||||
}
|
||||
}
|
||||
// if the node is still unassigned, assign it to the first backend that supports it
|
||||
for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
|
||||
ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
|
||||
}
|
||||
GGML_ASSERT(*cur_backend_id != -1);
|
||||
}
|
||||
|
||||
// pass 5: split graph, find tensors that need to be copied
|
||||
@@ -1098,7 +1103,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
|
||||
const int node_backend_id = tensor_backend_id(node);
|
||||
|
||||
assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
|
||||
GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
|
||||
|
||||
// check if we should start a new split based on the sources of the current node
|
||||
bool need_new_split = false;
|
||||
@@ -1156,7 +1161,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
|
||||
size_t src_id = hash_id(src);
|
||||
const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
|
||||
assert(src_backend_id != -1); // all inputs should be assigned by now
|
||||
GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
|
||||
|
||||
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
||||
if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
|
||||
|
||||
@@ -281,10 +281,10 @@ ggml_backend_t ggml_backend_blas_init(void) {
|
||||
ggml_backend_blas_context * ctx = new ggml_backend_blas_context;
|
||||
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_blas_guid(),
|
||||
/* .interface = */ blas_backend_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
/* .guid = */ ggml_backend_blas_guid(),
|
||||
/* .iface = */ blas_backend_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
|
||||
|
||||
+245
-380
@@ -753,69 +753,55 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
if (ggml_are_same_shape(src0, dst)) {
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
if (dst->type == src0->type) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
} else {
|
||||
aclnn_cast(ctx, acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
|
||||
}
|
||||
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
||||
} else {
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
|
||||
if (dst->type == src0->type) {
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
||||
return;
|
||||
} else {
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(),
|
||||
ggml_nelements(dst) * ggml_type_size(dst->type));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), src0->ne, src_trans_nb,
|
||||
GGML_MAX_DIMS);
|
||||
|
||||
aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
||||
ggml_cann_release_resources(ctx, src_trans_tensor);
|
||||
return;
|
||||
}
|
||||
} else if (ggml_is_contiguous(dst)) {
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
void* src_trans_buffer = src0->data;
|
||||
ggml_cann_pool_alloc src_buffer_allocator;
|
||||
if (!ggml_is_contiguous(src0)) {
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
src_buffer_allocator.alloc(ctx.pool(),
|
||||
ggml_nelements(src0) * ggml_type_size(src0->type));
|
||||
src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = ggml_type_size(dst->type);
|
||||
src_trans_nb[0] = ggml_type_size(src0->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), src0->ne, src_trans_nb,
|
||||
src_trans_buffer, ggml_cann_type_mapping(src0->type),
|
||||
ggml_type_size(src0->type), src0->ne, src_trans_nb,
|
||||
GGML_MAX_DIMS);
|
||||
|
||||
aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
|
||||
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
||||
ggml_cann_release_resources(ctx, src_trans_tensor);
|
||||
return;
|
||||
} else {
|
||||
GGML_ABORT("Unsupport dst is not tontiguous.");
|
||||
cann_copy(ctx, acl_src, src_trans_tensor);
|
||||
ggml_cann_release_resources(ctx, acl_src, src_trans_tensor);
|
||||
}
|
||||
|
||||
size_t src_reshape_nb[GGML_MAX_DIMS];
|
||||
src_reshape_nb[0] = ggml_type_size(src0->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_reshape_nb[i] = src_reshape_nb[i - 1] * dst->ne[i - 1];
|
||||
}
|
||||
|
||||
aclTensor* trans_acl_src = ggml_cann_create_tensor(src_trans_buffer,
|
||||
ggml_cann_type_mapping(src0->type),ggml_type_size(src0->type),
|
||||
dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
if (dst->type == src0->type) {
|
||||
cann_copy(ctx, trans_acl_src, acl_dst);
|
||||
} else {
|
||||
aclnn_cast(ctx, trans_acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
|
||||
}
|
||||
ggml_cann_release_resources(ctx, trans_acl_src, acl_dst);
|
||||
}
|
||||
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1330,160 +1316,196 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Applies the Alibi (Attention with Linear Biases) mechanism to the
|
||||
* @details This function implements the Alibi mechanism, which introduces
|
||||
* learnable biases into the attention scores to simulate relative
|
||||
* position encoding without the need for explicit positional
|
||||
* embeddings.
|
||||
* @brief Generate a range of values and apply a scalar base exponentiation.
|
||||
*
|
||||
* @param ctx The backend CANN context for executing operations.
|
||||
* @param acl_src The source tensor representing the query or key.
|
||||
* @param acl_position The position tensor containing relative positions.
|
||||
* @param acl_dst The destination tensor where the result will be stored.
|
||||
* @param n_head The number of attention heads.
|
||||
* @param src_ne The dimensions of the source tensor.
|
||||
* @param src_nb0 The byte size of the first dimension of the source
|
||||
tensor.
|
||||
* @param max_bias The maximum bias value used in the Alibi mechanism.
|
||||
* @param dst The destination tensor object for additional metadata.
|
||||
* This function creates an evenly spaced sequence from `start` to `stop` (exclusive),
|
||||
* with step size `step`, stores it in a temporary buffer, and then computes:
|
||||
*
|
||||
* The function performs the following steps:
|
||||
* 1. Calculates the logarithm floor of the number of heads to determine the
|
||||
base for bias calculation.
|
||||
* 2. Initializes arrays with arithmetic sequences and fills them with bias
|
||||
values.
|
||||
* 3. Computes the bias tensor based on the calculated biases and arithmetic
|
||||
sequences.
|
||||
* 4. Reshapes the bias tensor to match the dimensions of the input tensors.
|
||||
* 5. Multiplies the position tensor by the bias tensor.
|
||||
* 6. Adds the result of the multiplication to the source tensor to produce the
|
||||
final output.
|
||||
* @f[
|
||||
* slope[i] = m^{\left( start + i \cdot step \right)}, \quad 0 \le i < size
|
||||
* @f]
|
||||
*
|
||||
* The results are written to the provided @p slope_buffer.
|
||||
*
|
||||
* @param ctx CANN backend context for memory allocation and operator execution.
|
||||
* @param slope_buffer Pointer to the output buffer (float array) for the computed slope values.
|
||||
* @param m Scalar base for the exponentiation.
|
||||
* @param size Number of elements in the generated sequence.
|
||||
* @param start Starting exponent offset.
|
||||
* @param stop Stopping exponent offset (exclusive).
|
||||
* @param step Step size for the exponent increment.
|
||||
*/
|
||||
static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
aclTensor* acl_position, aclTensor* acl_dst,
|
||||
const int n_head, int64_t* src_ne, const size_t src_nb0,
|
||||
float max_bias, ggml_tensor* dst) {
|
||||
const int64_t ne2_ne3 = src_ne[2] * src_ne[3];
|
||||
GGML_ASSERT(src_nb0 == sizeof(float));
|
||||
GGML_ASSERT(n_head == src_ne[2]);
|
||||
static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
|
||||
float m, int64_t size, float start, float stop, float step){
|
||||
int64_t ne[] = {size};
|
||||
size_t nb[] = {sizeof(float)};
|
||||
|
||||
const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
|
||||
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(float));
|
||||
void* arange_buffer = arange_allocator.get();
|
||||
|
||||
float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
||||
float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
||||
aclTensor* arange_tensor = ggml_cann_create_tensor(
|
||||
arange_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
|
||||
aclnn_arange(ctx, arange_tensor, start, stop, step, size);
|
||||
|
||||
// init arange
|
||||
ggml_cann_pool_alloc arange_allocator(ctx.pool(),
|
||||
ne2_ne3 * ggml_type_size(dst->type));
|
||||
void* tmp_arange_buffer = arange_allocator.get();
|
||||
aclTensor* slope_tensor = ggml_cann_create_tensor(
|
||||
slope_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
|
||||
|
||||
// arange1: [1, ..., n_heads_log2_floor+1)
|
||||
float start = 1;
|
||||
float stop = n_heads_log2_floor + 1;
|
||||
float step = 1;
|
||||
int64_t n_elements_arange = n_heads_log2_floor;
|
||||
aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
|
||||
|
||||
int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
|
||||
size_t tmp_arange1_nb[] = {sizeof(dst->type)};
|
||||
aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
|
||||
tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
|
||||
aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
|
||||
|
||||
aclTensor* tmp_arange2_tensor = nullptr;
|
||||
if (n_heads_log2_floor < ne2_ne3) {
|
||||
// arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
|
||||
start = 1;
|
||||
stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
|
||||
step = 2;
|
||||
n_elements_arange = ne2_ne3 - n_heads_log2_floor;
|
||||
int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
|
||||
size_t tmp_arange2_nb[] = {sizeof(dst->type)};
|
||||
|
||||
aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor(
|
||||
(char*)tmp_arange_buffer +
|
||||
n_heads_log2_floor * ggml_type_size(dst->type),
|
||||
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
||||
tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
|
||||
n_elements_arange);
|
||||
}
|
||||
|
||||
// init mk_base
|
||||
ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
|
||||
ne2_ne3 * ggml_type_size(dst->type));
|
||||
void* tmp_mk_base_buffer = mk_base_allocator.get();
|
||||
int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
|
||||
size_t tmp_mk_base1_nb[] = {sizeof(dst->type)};
|
||||
aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
|
||||
aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
|
||||
|
||||
aclTensor* tmp_mk_base2_tensor = nullptr;
|
||||
if (n_heads_log2_floor < ne2_ne3) {
|
||||
int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
|
||||
size_t tmp_mk_base2_nb[] = {sizeof(dst->type)};
|
||||
aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor(
|
||||
(char*)tmp_mk_base_buffer +
|
||||
n_heads_log2_floor * ggml_type_size(dst->type),
|
||||
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
||||
tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
|
||||
}
|
||||
|
||||
// init mk
|
||||
int64_t tmp_mk_base_ne[] = {ne2_ne3};
|
||||
size_t tmp_mk_base_nb[] = {sizeof(dst->type)};
|
||||
aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
|
||||
tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
|
||||
|
||||
// reshape mk
|
||||
int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]};
|
||||
size_t tmp_mk_nb[GGML_MAX_DIMS];
|
||||
tmp_mk_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
|
||||
}
|
||||
aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
|
||||
ACL_FORMAT_ND);
|
||||
|
||||
// acl_position * mk
|
||||
int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]};
|
||||
size_t tmp_output_nb[GGML_MAX_DIMS];
|
||||
tmp_output_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1];
|
||||
}
|
||||
ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst));
|
||||
void* tmp_output_buffer = output_allocator.get();
|
||||
aclTensor* tmp_output_tensor = ggml_cann_create_tensor(
|
||||
tmp_output_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS,
|
||||
ACL_FORMAT_ND);
|
||||
aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor);
|
||||
|
||||
// add
|
||||
aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
|
||||
ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
|
||||
tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
|
||||
tmp_arange_tensor, tmp_mk_tensor, tmp_output_tensor);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc, arange_tensor, slope_tensor);
|
||||
ggml_cann_release_resources(ctx, sc, arange_tensor, slope_tensor);
|
||||
}
|
||||
|
||||
void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
/**
|
||||
* @brief Compute slope values for multiple attention heads based on ALiBi bias parameters.
|
||||
*
|
||||
* This function generates slope values for each attention head according to the ALiBi
|
||||
* (Attention with Linear Biases) method. It splits the computation into two ranges depending
|
||||
* on whether the head index is less than @p n_head_log2 or not, and uses different base values
|
||||
* (`m0` and `m1`) for the exponentiation.
|
||||
*
|
||||
* @f[
|
||||
* slope[h] =
|
||||
* \begin{cases}
|
||||
* m_0^{(h + 1)}, & h < n\_head\_log2 \\
|
||||
* m_1^{\left( 2 \cdot (h - n\_head\_log2) + 1 \right)}, & h \geq n\_head\_log2
|
||||
* \end{cases}
|
||||
* \quad , \quad \text{if } max\_bias > 0
|
||||
* @f]
|
||||
*
|
||||
* If @p max_bias <= 0, all slope values are set to 1.0.
|
||||
*
|
||||
* @param ctx CANN backend context for memory allocation and operator execution.
|
||||
* @param n_head Total number of attention heads.
|
||||
* @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
|
||||
* @param max_bias Maximum bias value for slope computation.
|
||||
*
|
||||
*/
|
||||
static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
|
||||
void* slope_buffer, float max_bias) {
|
||||
const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
|
||||
|
||||
float m0 = powf(2.0f, -(max_bias) / n_head_log2);
|
||||
float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
// const float slope = (max_bias > 0.0f) ?
|
||||
// h < n_head_log2 ?
|
||||
// powf(m0, h + 1) :
|
||||
// powf(m1, 2*(h - n_head_log2) + 1) :
|
||||
// 1.0f;
|
||||
// arange1
|
||||
float start = 0 + 1;
|
||||
float end = (n_head_log2 - 1) + 1;
|
||||
float step = 1;
|
||||
float count = n_head_log2;
|
||||
// end needs to be +1 because aclnn uses a left-closed, right-open interval.
|
||||
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
|
||||
if (n_head_log2 < n_head) {
|
||||
// arange2
|
||||
start = 2 * (n_head_log2 - n_head_log2) + 1;
|
||||
end = 2 * ((n_head - 1) - n_head_log2) + 1;
|
||||
step = 2;
|
||||
count = n_head - n_head_log2;
|
||||
aclnn_get_slope_inner(
|
||||
ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
|
||||
m1, count, start, end + 1, step);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Add ALiBi (Attention with Linear Biases) positional biases to the attention mask.
|
||||
*
|
||||
* This function computes the ALiBi slopes for each attention head (if max_bias > 0),
|
||||
* multiplies them with the attention mask to produce bias tensors, and adds these biases
|
||||
* to the destination tensor (@p dst).
|
||||
*
|
||||
* The function performs necessary broadcasting of the mask and slope tensors to match
|
||||
* the shape of the destination tensor, then applies element-wise multiplication and addition
|
||||
* using CANN operators.
|
||||
*
|
||||
* @param ctx CANN backend context for memory management and operator execution.
|
||||
* @param mask Input attention mask tensor, assumed to be contiguous.
|
||||
* @param dst Destination tensor to which ALiBi biases will be added.
|
||||
* @param dst_ptr Pointer to the memory of the destination tensor.
|
||||
* @param max_bias Maximum bias value controlling the slope scaling.
|
||||
*
|
||||
* @note
|
||||
* - Write data into dst_ptr using only the shape information of the dst tensor.
|
||||
* - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting.
|
||||
*/
|
||||
static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
|
||||
ggml_tensor* dst, void* dst_ptr, float max_bias) {
|
||||
void* slope_buffer = nullptr;
|
||||
void* bias_buffer = nullptr;
|
||||
|
||||
if (max_bias > 0.0f) {
|
||||
int64_t n_heads = dst->ne[2];
|
||||
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
|
||||
slope_buffer = slope_allocator.get();
|
||||
ggml_cann_pool_alloc bias_allocator(
|
||||
ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
|
||||
bias_buffer = bias_allocator.get();
|
||||
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
|
||||
}
|
||||
|
||||
// broadcast for mask, slop and dst;
|
||||
int64_t nr2 = dst->ne[2] / mask->ne[2];
|
||||
int64_t nr3 = dst->ne[3] / mask->ne[3];
|
||||
|
||||
// broadcast the mask across rows
|
||||
int64_t mask_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], 1, mask->ne[3], 1 };
|
||||
size_t mask_nb[] = {
|
||||
mask_nb[0] = mask->nb[0], mask_nb[1] = mask->nb[1], mask_nb[2] = mask->nb[2],
|
||||
mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3]
|
||||
};
|
||||
|
||||
int64_t dst_ne[] = { dst->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], nr3 };
|
||||
size_t dst_nb[] = {
|
||||
dst_nb[0] = dst->nb[0], dst_nb[1] = dst->nb[1], dst_nb[2] = dst->nb[2],
|
||||
dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3]
|
||||
};
|
||||
|
||||
// slope is a 1 dim tensor, slope.ne2 == dst.ne2
|
||||
int64_t slope_ne[] = { 1, 1, mask->ne[2], nr2, 1, 1 };
|
||||
size_t slope_nb[GGML_MAX_DIMS + 2];
|
||||
slope_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
|
||||
slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
|
||||
}
|
||||
|
||||
aclTensor* acl_slope = ggml_cann_create_tensor(
|
||||
slope_buffer, ACL_FLOAT, sizeof(float),
|
||||
slope_ne, slope_nb, GGML_MAX_DIMS + 2);
|
||||
aclTensor* acl_mask = ggml_cann_create_tensor(
|
||||
mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
|
||||
|
||||
// write data into dst_ptr using only the shape information of the dst tensor.
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(
|
||||
dst_ptr, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), dst_ne, dst_nb,
|
||||
GGML_MAX_DIMS + 2);
|
||||
|
||||
if (max_bias > 0.0f) {
|
||||
int64_t bias_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], 1 };
|
||||
size_t bias_nb[GGML_MAX_DIMS + 2];
|
||||
bias_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
|
||||
bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1];
|
||||
}
|
||||
aclTensor* bias_tensor = ggml_cann_create_tensor(
|
||||
bias_buffer, ACL_FLOAT, sizeof(float),
|
||||
bias_ne, bias_nb, GGML_MAX_DIMS + 2);
|
||||
|
||||
aclnn_mul(ctx, acl_slope, acl_mask, bias_tensor);
|
||||
aclnn_add(ctx, acl_dst, bias_tensor);
|
||||
ggml_cann_release_resources(ctx, bias_tensor);
|
||||
} else {
|
||||
aclnn_add(ctx, acl_dst, acl_mask);
|
||||
}
|
||||
ggml_cann_release_resources(ctx, acl_slope, acl_mask, acl_dst);
|
||||
}
|
||||
|
||||
void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cann_dup(ctx, dst);
|
||||
}
|
||||
|
||||
@@ -1501,118 +1523,41 @@ void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
* @param acl_dst The destination tensor where the softmax results will be
|
||||
* stored.
|
||||
*/
|
||||
static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
int64_t dim, aclTensor* acl_dst) {
|
||||
static void aclnn_softmax(ggml_backend_cann_context & ctx,
|
||||
aclTensor* acl_src, int64_t dim, aclTensor * acl_dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
|
||||
}
|
||||
|
||||
void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
ggml_tensor* src1 = dst->src[1]; // mask
|
||||
|
||||
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
float scale = 1.0f;
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));
|
||||
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
|
||||
|
||||
// input mul scale
|
||||
aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
|
||||
ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0));
|
||||
void* src_tensor_buffer = src_tensor_allocator.get();
|
||||
aclTensor* softmax_tensor = ggml_cann_create_tensor(
|
||||
src_tensor_buffer, ggml_cann_type_mapping(src0->type),
|
||||
ggml_element_size(src0), src0->ne, src0->nb,GGML_MAX_DIMS);
|
||||
|
||||
size_t n_bytes = ggml_nbytes(src0);
|
||||
ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes);
|
||||
void* input_mul_scale_buffer = mul_scale_allocator.get();
|
||||
aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor(
|
||||
input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne,
|
||||
src0->nb, GGML_MAX_DIMS);
|
||||
|
||||
bool inplace = false;
|
||||
aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace);
|
||||
aclnn_muls(ctx, acl_src0, scale, softmax_tensor, false);
|
||||
|
||||
// mask
|
||||
aclTensor* acl_src1_fp32_tensor = nullptr;
|
||||
aclTensor* tmp_mask_tensor = nullptr;
|
||||
ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool());
|
||||
if (src1) {
|
||||
const bool use_f16 = src1->type == GGML_TYPE_F16;
|
||||
if (use_f16) {
|
||||
// cast to fp32
|
||||
size_t n_bytes = ggml_nelements(src1) * sizeof(float_t);
|
||||
size_t src1_fp32_nb[GGML_MAX_DIMS];
|
||||
src1_fp32_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1];
|
||||
}
|
||||
src1_fp32_allocator.alloc(n_bytes);
|
||||
void* src1_fp32_buffer = src1_fp32_allocator.get();
|
||||
acl_src1_fp32_tensor = ggml_cann_create_tensor(
|
||||
src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne,
|
||||
src1_fp32_nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
|
||||
aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
|
||||
ggml_cann_release_resources(ctx, acl_src1);
|
||||
} else {
|
||||
acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
|
||||
}
|
||||
|
||||
// broadcast the mask across rows, only use ne11 of ne01 in mask
|
||||
if (src1->ne[1] != src0->ne[1]) {
|
||||
// mask shape: [1,1,ne11,ne10]
|
||||
int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1};
|
||||
size_t tmp_mask_nb[GGML_MAX_DIMS];
|
||||
tmp_mask_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1];
|
||||
}
|
||||
tmp_mask_tensor = ggml_cann_create_tensor(
|
||||
src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb,
|
||||
GGML_MAX_DIMS, ACL_FORMAT_ND);
|
||||
}
|
||||
|
||||
// alibi
|
||||
const int n_head = src0->ne[2];
|
||||
const size_t src_nb0 = src0->nb[0];
|
||||
|
||||
n_bytes = ggml_nbytes(dst);
|
||||
ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes);
|
||||
void* output_buffer = output_allocator.get();
|
||||
aclTensor* alibi_output_tensor = ggml_cann_create_tensor(
|
||||
output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne,
|
||||
dst->nb, GGML_MAX_DIMS);
|
||||
if (max_bias <= 0.0f) {
|
||||
// slope = 1.0
|
||||
if (tmp_mask_tensor) {
|
||||
aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor,
|
||||
alibi_output_tensor);
|
||||
} else {
|
||||
aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor,
|
||||
alibi_output_tensor);
|
||||
}
|
||||
} else {
|
||||
// slope != 1.0
|
||||
if (tmp_mask_tensor) {
|
||||
aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor,
|
||||
alibi_output_tensor, n_head, src0->ne, src_nb0,
|
||||
max_bias, dst);
|
||||
} else {
|
||||
aclnn_alibi(ctx, acl_input_mul_scale_tensor,
|
||||
acl_src1_fp32_tensor, alibi_output_tensor, n_head,
|
||||
src0->ne, src_nb0, max_bias, dst);
|
||||
}
|
||||
}
|
||||
|
||||
// softmax
|
||||
aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
|
||||
ggml_cann_release_resources(ctx, alibi_output_tensor);
|
||||
} else {
|
||||
aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
|
||||
aclnn_add_alibi(ctx, src1, src0, src_tensor_buffer, max_bias);
|
||||
}
|
||||
|
||||
ggml_cann_release_resources(ctx, acl_src0, acl_src1_fp32_tensor, acl_dst,
|
||||
acl_scale, acl_input_mul_scale_tensor, tmp_mask_tensor);
|
||||
// softmax
|
||||
aclnn_softmax(ctx, softmax_tensor, 3, acl_dst);
|
||||
ggml_cann_release_resources(ctx, acl_src0, acl_dst, acl_scale, softmax_tensor);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -3208,104 +3153,24 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
||||
// Compute the slope if needed. Derived from ggml_cann_softmax().
|
||||
if(maxBias != 0.0f){
|
||||
// alibi
|
||||
const int64_t ne2_ne3 = src0->ne[2] * src0->ne[3];
|
||||
const int64_t n_head = src0->ne[2];
|
||||
const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
|
||||
float m0 = powf(2.0f, -(maxBias) / n_heads_log2_floor);
|
||||
float m1 = powf(2.0f, -(maxBias / 2.0f) / n_heads_log2_floor);
|
||||
// init arange
|
||||
ggml_cann_pool_alloc arange_allocator(ctx.pool(),
|
||||
ne2_ne3 * faElemSize);
|
||||
void* tmp_arange_buffer = arange_allocator.get();
|
||||
const int64_t n_heads = src0->ne[2];
|
||||
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
|
||||
void* slope_buffer = slope_allocator.get();
|
||||
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);
|
||||
|
||||
// arange1: [1, ..., n_heads_log2_floor+1)
|
||||
float start = 1;
|
||||
float stop = n_heads_log2_floor + 1;
|
||||
float step = 1;
|
||||
int64_t n_elements_arange = n_heads_log2_floor;
|
||||
|
||||
int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
|
||||
size_t tmp_arange1_nb[] = {faElemSize};
|
||||
aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
|
||||
tmp_arange_buffer, faDataType, faElemSize,
|
||||
tmp_arange1_ne, tmp_arange1_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
|
||||
aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
|
||||
|
||||
aclTensor* tmp_arange2_tensor = nullptr;
|
||||
if (n_heads_log2_floor < ne2_ne3) {
|
||||
// arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
|
||||
start = 1;
|
||||
stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
|
||||
step = 2;
|
||||
n_elements_arange = ne2_ne3 - n_heads_log2_floor;
|
||||
int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
|
||||
size_t tmp_arange2_nb[] = {faElemSize};
|
||||
|
||||
aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor(
|
||||
(char*)tmp_arange_buffer +
|
||||
n_heads_log2_floor * faElemSize,
|
||||
faDataType, faElemSize,
|
||||
tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
|
||||
n_elements_arange);
|
||||
int64_t slope_ne[] = {1, 1, n_heads, 1};
|
||||
size_t slope_nb[GGML_MAX_DIMS];
|
||||
slope_nb[0] = sizeof(float);
|
||||
for(int i = 1;i<GGML_MAX_DIMS;i++) {
|
||||
slope_nb[i] = slope_nb[i-1] * slope_ne[0];
|
||||
}
|
||||
|
||||
// init mk_base
|
||||
ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
|
||||
ne2_ne3 * faElemSize);
|
||||
void* tmp_mk_base_buffer = mk_base_allocator.get();
|
||||
int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
|
||||
size_t tmp_mk_base1_nb[] = {faElemSize};
|
||||
aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, faDataType, faElemSize,
|
||||
tmp_mk_base1_ne, tmp_mk_base1_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclTensor* slope_tensor = ggml_cann_create_tensor(
|
||||
slope_buffer, ACL_FLOAT, sizeof(float),
|
||||
slope_ne, slope_nb, GGML_MAX_DIMS);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, slope_tensor);
|
||||
|
||||
aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
|
||||
|
||||
aclTensor* tmp_mk_base2_tensor = nullptr;
|
||||
if (n_heads_log2_floor < ne2_ne3) {
|
||||
int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
|
||||
size_t tmp_mk_base2_nb[] = {faElemSize};
|
||||
aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor(
|
||||
(char*)tmp_mk_base_buffer +
|
||||
n_heads_log2_floor * faElemSize,
|
||||
faDataType, faElemSize,
|
||||
tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
|
||||
}
|
||||
|
||||
// init mk
|
||||
int64_t tmp_mk_base_ne[] = {ne2_ne3};
|
||||
size_t tmp_mk_base_nb[] = {faElemSize};
|
||||
aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, faDataType, faElemSize,
|
||||
tmp_mk_base_ne, tmp_mk_base_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
|
||||
tmp_arange_buffer, faDataType, faElemSize,
|
||||
tmp_mk_base_ne, tmp_mk_base_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
|
||||
|
||||
// reshape mk
|
||||
int64_t tmp_mk_ne[] = {1, 1, src0->ne[2], src0->ne[3]};
|
||||
size_t tmp_mk_nb[GGML_MAX_DIMS];
|
||||
tmp_mk_nb[0] = faElemSize;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
|
||||
}
|
||||
aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, faDataType, faElemSize,
|
||||
tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
|
||||
ACL_FORMAT_ND);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, tmp_mk_tensor);
|
||||
|
||||
ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
|
||||
tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
|
||||
tmp_arange_tensor, tmp_mk_tensor);
|
||||
ggml_cann_release_resources(ctx, slope_tensor);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2456,8 +2456,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
// value of paddingW should be at most half of kernelW
|
||||
return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
|
||||
}
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_REPEAT:
|
||||
@@ -2503,9 +2503,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
if (op->src[2]) {
|
||||
return false;
|
||||
}
|
||||
// TODO: support broadcast
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
|
||||
return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:{
|
||||
// derived from [ggml-cuda.cu]
|
||||
if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
|
||||
@@ -2532,11 +2530,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
// DeepSeek MLA
|
||||
return false;
|
||||
}
|
||||
// TODO: support broadcast
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
|
||||
if (op->src[0]->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
float logitSoftcap = 0.0f;
|
||||
memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
|
||||
if(logitSoftcap != 0.0f) {
|
||||
|
||||
@@ -460,7 +460,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
|
||||
# binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
|
||||
message(STATUS "z17 target")
|
||||
list(APPEND ARCH_FLAGS -march=z17)
|
||||
list(APPEND ARCH_FLAGS -march=arch15)
|
||||
else()
|
||||
message(STATUS "Unknown target")
|
||||
message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
|
||||
|
||||
@@ -40,18 +40,22 @@
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||
// repack.cpp
|
||||
@@ -80,12 +84,14 @@
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#elif defined(__loongarch64)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
@@ -103,12 +109,14 @@
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#elif defined(__riscv)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
@@ -133,11 +141,13 @@
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#elif defined(__s390x__)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
@@ -164,12 +174,14 @@
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#elif defined(__wasm__)
|
||||
// quants.c
|
||||
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||
@@ -195,10 +207,12 @@
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2022,6 +2022,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
ggml_compute_forward_opt_step_adamw(params, tensor);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
{
|
||||
ggml_compute_forward_opt_step_sgd(params, tensor);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_NONE:
|
||||
{
|
||||
// nop
|
||||
@@ -2325,6 +2330,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
|
||||
// ggml-backend interface
|
||||
|
||||
std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
|
||||
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() {
|
||||
static std::vector<ggml_backend_buffer_type_t> bufts = []() {
|
||||
std::vector<ggml_backend_buffer_type_t> bufts;
|
||||
|
||||
@@ -57,8 +57,6 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
|
||||
}
|
||||
#endif
|
||||
|
||||
bufts.push_back(NULL);
|
||||
|
||||
return bufts;
|
||||
}();
|
||||
|
||||
@@ -66,14 +64,20 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
|
||||
return ggml_backend_cpu_get_extra_buffers_type().data();
|
||||
static std::vector<ggml_backend_buffer_type_t> extra_bufts = [] {
|
||||
std::vector<ggml_backend_buffer_type_t> bufts = ggml_backend_cpu_get_extra_buffer_types();
|
||||
bufts.push_back(nullptr);
|
||||
return bufts;
|
||||
}();
|
||||
|
||||
return extra_bufts.data();
|
||||
|
||||
GGML_UNUSED(device);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
|
||||
for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
|
||||
if (extra && extra == buft) {
|
||||
for (auto * extra : ggml_backend_cpu_get_extra_buffer_types()) {
|
||||
if (extra == buft) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -210,10 +214,10 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
||||
ctx->abort_callback_data = NULL;
|
||||
|
||||
ggml_backend_t cpu_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_cpu_guid(),
|
||||
/* .interface = */ ggml_backend_cpu_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
/* .guid = */ ggml_backend_cpu_guid(),
|
||||
/* .iface = */ ggml_backend_cpu_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
if (cpu_backend == NULL) {
|
||||
@@ -397,20 +401,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
|
||||
return true;
|
||||
}
|
||||
|
||||
// extra_buffer_op?
|
||||
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
|
||||
if (extra) {
|
||||
auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
|
||||
if (buf_extra && buf_extra->supports_op(dev, op)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// the other case need host buffer.
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
|
||||
return false;
|
||||
// check extra buffer types
|
||||
// note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (op->src[i] && op->src[i]->buffer &&
|
||||
ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
|
||||
auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
|
||||
return buf_extra->supports_op(dev, op);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -259,7 +259,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
const int64_t m_start = 0;
|
||||
|
||||
const int64_t n_step = static_cast<int64_t>(kernel->get_n_step());
|
||||
const int64_t num_threads = KAI_MIN(n / n_step, nth);
|
||||
int64_t num_threads = KAI_MIN(n / n_step, nth);
|
||||
if (num_threads <= 0) {
|
||||
num_threads = 1;
|
||||
}
|
||||
|
||||
if (ith < num_threads) {
|
||||
const int64_t num_n_per_thread0 = round_down(n / num_threads, n_step);
|
||||
@@ -309,7 +312,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
GGML_ASSERT(kernel);
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
const int nth_raw = params->nth;
|
||||
const int nth = nth_raw > 0 ? nth_raw : 1;
|
||||
|
||||
const size_t k = ne00;
|
||||
const size_t m = ne11;
|
||||
@@ -327,9 +331,12 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
|
||||
const size_t n_start = ith * num_n_per_thread;
|
||||
|
||||
size_t n_to_process = num_n_per_thread;
|
||||
if ((n_start + n_to_process) > n) {
|
||||
n_to_process = n - n_start;
|
||||
size_t n_to_process = 0;
|
||||
if (n_start < n) {
|
||||
n_to_process = num_n_per_thread;
|
||||
if ((n_start + n_to_process) > n) {
|
||||
n_to_process = n - n_start;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate number of columns to be processed per thread
|
||||
@@ -361,8 +368,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
const void* lhs_ptr = (const void*)((const char *)lhs_packed + lhs_packed_offset);
|
||||
float *dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
|
||||
|
||||
variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
|
||||
sizeof(float), -FLT_MAX, FLT_MAX);
|
||||
if (n_to_process > 0) {
|
||||
variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
|
||||
sizeof(float), -FLT_MAX, FLT_MAX);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -10330,6 +10330,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
const float * adamw_params_ptr = ggml_get_data_f32(adamw_params);
|
||||
|
||||
const float alpha = adamw_params_ptr[0];
|
||||
const float beta1 = adamw_params_ptr[1];
|
||||
const float beta2 = adamw_params_ptr[2];
|
||||
@@ -10337,7 +10338,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
||||
const float wd = adamw_params_ptr[4];
|
||||
const float beta1h = adamw_params_ptr[5];
|
||||
const float beta2h = adamw_params_ptr[6];
|
||||
|
||||
const float keep = 1.f - alpha * wd;
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir/(ne02*ne01);
|
||||
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
@@ -10360,7 +10361,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
||||
// The weight decay is applied independently of the Adam momenta m and v.
|
||||
// This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
|
||||
// See: https://arxiv.org/pdf/1711.05101v3.pdf
|
||||
w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
|
||||
w[i00] = w[i00] * keep - alpha * mh / vh;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -10382,3 +10383,63 @@ void ggml_compute_forward_opt_step_adamw(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_opt_step_sgd_f32(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src0_grad = dst->src[1];
|
||||
const ggml_tensor * sgd_params = dst->src[2];
|
||||
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
|
||||
GGML_ASSERT(ggml_nelements(sgd_params) == 2);
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1) / nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr * ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
// using adamw param subset we care about - alpha, wd - could have a separate struct
|
||||
const float * sgd_params_ptr = ggml_get_data_f32(sgd_params);
|
||||
const float alpha = sgd_params_ptr[0];
|
||||
const float keep = 1.f - alpha * sgd_params_ptr[1];
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir / (ne02 * ne01);
|
||||
const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
|
||||
const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
|
||||
|
||||
const size_t offset = i03 * nb03 + i02 * nb02 + i01 * nb01;
|
||||
|
||||
float * w = (float *) ((char *) src0->data + offset); // weight
|
||||
const float * g = (const float *) ((const char *) src0_grad->data + offset); // grad
|
||||
|
||||
for (int i00 = 0; i00 < ne00; ++i00) {
|
||||
w[i00] = w[i00] * keep - alpha * g[i00];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_opt_step_sgd_f32(params, dst);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error - sgd is F32 only");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,7 +107,7 @@ void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params *
|
||||
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
+200
-51
@@ -206,8 +206,9 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 4;
|
||||
|
||||
assert (n % qk == 0);
|
||||
assert (nc % ncols_interleaved == 0);
|
||||
assert(nr == 1);
|
||||
assert(n % qk == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(s);
|
||||
UNUSED(bs);
|
||||
@@ -307,30 +308,28 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
{
|
||||
float sumf[8];
|
||||
int sumi;
|
||||
float sumf[8];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -494,43 +493,73 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 4;
|
||||
|
||||
assert (n % qk == 0);
|
||||
assert (nc % ncols_interleaved == 0);
|
||||
assert(nr == 1);
|
||||
assert(n % qk == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(s);
|
||||
UNUSED(bs);
|
||||
UNUSED(vx);
|
||||
UNUSED(vy);
|
||||
UNUSED(nr);
|
||||
UNUSED(nc);
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
{
|
||||
float sumf[4];
|
||||
int sumi;
|
||||
float sumf[4];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
||||
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
||||
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 8;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert(nr == 1);
|
||||
assert(n % qk == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(bs);
|
||||
UNUSED(nr);
|
||||
|
||||
float sumf[8];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
||||
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -934,6 +963,50 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 8;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nr % 4 == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
float sumf[4][8];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
||||
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
||||
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
||||
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++)
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
|
||||
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
|
||||
@@ -1285,15 +1358,16 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
|
||||
|
||||
static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
||||
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
||||
//GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
||||
GGML_ASSERT(interleave_block == 4);
|
||||
|
||||
block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
|
||||
const block_iq4_nl * src = (const block_iq4_nl *)data;
|
||||
const block_iq4_nl * src = (const block_iq4_nl *)data;
|
||||
block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
|
||||
|
||||
block_iq4_nl dst_tmp[4];
|
||||
|
||||
int nrow = ggml_nrows(t);
|
||||
int nrows_interleaved = 4;
|
||||
int nblocks = t->ne[0] / QK4_0;
|
||||
int nblocks = t->ne[0] / QK4_NL;
|
||||
|
||||
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
||||
|
||||
@@ -1315,6 +1389,63 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
|
||||
GGML_UNUSED(data_size);
|
||||
}
|
||||
|
||||
static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
||||
block_iq4_nlx8 out;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
out.d[i] = in[i].d;
|
||||
}
|
||||
|
||||
const int end = QK4_NL * 4 / blck_size_interleave;
|
||||
|
||||
if (blck_size_interleave == 8) {
|
||||
for (int i = 0; i < end; ++i) {
|
||||
int src_id = i % 8;
|
||||
int src_offset = (i / 8) * blck_size_interleave;
|
||||
int dst_offset = i * blck_size_interleave;
|
||||
|
||||
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
||||
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
||||
GGML_ASSERT(interleave_block == 8);
|
||||
|
||||
const block_iq4_nl * src = (const block_iq4_nl *)data;
|
||||
block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
|
||||
|
||||
block_iq4_nl dst_tmp[8];
|
||||
|
||||
int nrow = ggml_nrows(t);
|
||||
int nrows_interleaved = 8;
|
||||
int nblocks = t->ne[0] / QK4_NL;
|
||||
|
||||
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
||||
|
||||
if (t->ne[1] % nrows_interleaved != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
||||
for (int64_t x = 0; x < nblocks; x++) {
|
||||
for (int i = 0; i < nrows_interleaved; i++) {
|
||||
dst_tmp[i] = src[x + i * nblocks];
|
||||
}
|
||||
*dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
|
||||
}
|
||||
src += nrows_interleaved * nblocks;
|
||||
}
|
||||
return 0;
|
||||
|
||||
GGML_UNUSED(data_size);
|
||||
}
|
||||
|
||||
namespace ggml::cpu::repack {
|
||||
// repack
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
|
||||
@@ -1350,6 +1481,10 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
|
||||
// return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
|
||||
//}
|
||||
|
||||
template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||
return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
|
||||
}
|
||||
|
||||
// gemv
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
||||
void gemv(int, float *, size_t, const void *, const void *, int, int);
|
||||
@@ -1378,6 +1513,10 @@ template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
|
||||
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
// gemm
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
||||
void gemm(int, float *, size_t, const void *, const void *, int, int);
|
||||
@@ -1406,6 +1545,10 @@ template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
|
||||
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
class tensor_traits_base : public ggml::cpu::tensor_traits {
|
||||
public:
|
||||
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
|
||||
@@ -1680,6 +1823,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
||||
|
||||
// instance for IQ4
|
||||
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
||||
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
|
||||
|
||||
if (cur->type == GGML_TYPE_Q4_0) {
|
||||
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
|
||||
@@ -1710,6 +1854,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
||||
}
|
||||
}
|
||||
} else if (cur->type == GGML_TYPE_IQ4_NL) {
|
||||
if (ggml_cpu_has_avx2()) {
|
||||
if (cur->ne[1] % 8 == 0) {
|
||||
return &iq4_nl_8x8_q8_0;
|
||||
}
|
||||
}
|
||||
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
||||
if (cur->ne[1] % 4 == 0) {
|
||||
return &iq4_nl_4x4_q8_0;
|
||||
|
||||
@@ -67,6 +67,13 @@ struct block_iq4_nlx4 {
|
||||
|
||||
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
|
||||
|
||||
struct block_iq4_nlx8 {
|
||||
ggml_half d[8]; // deltas for 8 iq4_nl blocks
|
||||
uint8_t qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
@@ -80,12 +87,14 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
// Native implementations
|
||||
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
@@ -97,12 +106,14 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
} // extern "C"
|
||||
|
||||
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
|
||||
} // namespace ggml::cpu
|
||||
|
||||
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
|
||||
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
|
||||
for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
|
||||
if (extra && extra->context) {
|
||||
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
|
||||
auto tensor_traits = buf_extra->get_tensor_traits(op);
|
||||
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
|
||||
}
|
||||
|
||||
bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
|
||||
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
|
||||
for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
|
||||
if (extra && extra->context) {
|
||||
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
|
||||
auto tensor_traits = buf_extra->get_tensor_traits(op);
|
||||
|
||||
@@ -33,6 +33,6 @@ class extra_buffer_type {
|
||||
} // namespace ggml::cpu
|
||||
|
||||
// implemented in ggml-cpu.cpp.
|
||||
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
|
||||
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();
|
||||
|
||||
#endif
|
||||
|
||||
@@ -120,6 +120,10 @@ if (CUDAToolkit_FOUND)
|
||||
|
||||
set(CUDA_FLAGS -use_fast_math -extended-lambda)
|
||||
|
||||
if (GGML_CUDA_DEBUG)
|
||||
list(APPEND CUDA_FLAGS -lineinfo)
|
||||
endif()
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||
# Options are:
|
||||
# - none (not recommended)
|
||||
|
||||
@@ -87,6 +87,10 @@
|
||||
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
|
||||
#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
# define GGML_CUDA_USE_CUB
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
|
||||
#ifdef __CUDA_ARCH_LIST__
|
||||
constexpr bool ggml_cuda_has_arch_impl(int) {
|
||||
return false;
|
||||
@@ -233,9 +237,13 @@ typedef float2 dfloat2;
|
||||
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
|
||||
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#define NEW_MMA_AVAILABLE
|
||||
#define TURING_MMA_AVAILABLE
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#define AMPERE_MMA_AVAILABLE
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#define CP_ASYNC_AVAILABLE
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
@@ -303,12 +311,16 @@ static bool amd_mfma_available(const int cc) {
|
||||
}
|
||||
|
||||
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
||||
static bool new_mma_available(const int cc) {
|
||||
static bool turing_mma_available(const int cc) {
|
||||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
|
||||
}
|
||||
|
||||
static bool ampere_mma_available(const int cc) {
|
||||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
|
||||
}
|
||||
|
||||
static bool cp_async_available(const int cc) {
|
||||
return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
|
||||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
|
||||
}
|
||||
|
||||
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
||||
@@ -412,26 +424,6 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
||||
#endif // FP16_AVAILABLE
|
||||
}
|
||||
|
||||
// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
|
||||
template<bool norm>
|
||||
static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
|
||||
const int row = blockIdx.x;
|
||||
const int col = threadIdx.x;
|
||||
|
||||
float sum = 0.0f;
|
||||
for (int i = col; i < ncols; i += blockDim.x) {
|
||||
sum += x[row * ncols + i];
|
||||
}
|
||||
|
||||
sum = warp_reduce_sum(sum);
|
||||
|
||||
if (col != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[row] = norm ? sum / ncols : sum;
|
||||
}
|
||||
|
||||
template<int width = WARP_SIZE>
|
||||
static __device__ __forceinline__ int warp_reduce_all(int x) {
|
||||
#ifdef GGML_USE_HIP
|
||||
@@ -472,25 +464,21 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
|
||||
#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
|
||||
#if defined(GGML_USE_HIP)
|
||||
return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
|
||||
#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
|
||||
#elif CUDART_VERSION >= CUDART_HMAX
|
||||
return __hmax2(a, b);
|
||||
#elif !defined(GGML_USE_HIP)
|
||||
#else
|
||||
half2 ret;
|
||||
reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a), __low2float(b)));
|
||||
reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
|
||||
return ret;
|
||||
#else
|
||||
GGML_UNUSED(a);
|
||||
GGML_UNUSED(b);
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
}
|
||||
|
||||
template<int width = WARP_SIZE>
|
||||
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
|
||||
#pragma unroll
|
||||
for (int offset = width/2; offset > 0; offset >>= 1) {
|
||||
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
|
||||
@@ -499,7 +487,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
#else
|
||||
GGML_UNUSED(x);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
|
||||
}
|
||||
|
||||
#if CUDART_VERSION < CUDART_HMASK
|
||||
|
||||
@@ -31,8 +31,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
|
||||
dequantize_kernel(vx, ib, iqs, v);
|
||||
|
||||
const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
|
||||
y[iy0 + 0] = float(v.x);
|
||||
y[iy0 + y_offset] = float(v.y);
|
||||
y[iy0 + 0] = ggml_cuda_cast<dst_t>(v.x);
|
||||
y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
|
||||
}
|
||||
|
||||
template <bool need_check>
|
||||
@@ -630,7 +630,7 @@ static __global__ void convert_unary(
|
||||
|
||||
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
|
||||
const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
|
||||
y[iy] = float(x[ix]);
|
||||
y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
|
||||
@@ -29,3 +29,16 @@ typedef to_t_nc_cuda_t<nv_bfloat16> to_bf16_nc_cuda_t;
|
||||
to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
|
||||
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
|
||||
to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
|
||||
|
||||
template<typename dst_t, typename src_t>
|
||||
__host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
|
||||
if constexpr (std::is_same_v<dst_t, src_t>) {
|
||||
return x;
|
||||
} else if constexpr(std::is_same_v<dst_t, nv_bfloat16>) {
|
||||
return __float2bfloat16(float(x));
|
||||
} else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
|
||||
return __bfloat162float(x);
|
||||
} else {
|
||||
return float(x);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,15 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-common.h"
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
|
||||
if constexpr (std::is_same_v<src_t, dst_t>) {
|
||||
*dst = *src;
|
||||
} else {
|
||||
*dst = float(*src);
|
||||
}
|
||||
}
|
||||
#include "convert.cuh"
|
||||
|
||||
static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
|
||||
if (x <= val[0]) return 0;
|
||||
@@ -221,5 +213,5 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
|
||||
convert_flt((const src_t *)cxi, (dst_t *)cdsti);
|
||||
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
|
||||
}
|
||||
|
||||
@@ -539,11 +539,15 @@ static __global__ void flash_attn_mask_to_KV_max(
|
||||
all_inf = warp_reduce_all(all_inf);
|
||||
|
||||
if (!all_inf) {
|
||||
KV_max_sj += FATTN_KQ_STRIDE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If the break in the loop was not triggered, KV_max_sj is now -FATTN_KQ_STRIDE.
|
||||
// If the break was triggered it's the lower edge of the tile with the first non-masked values.
|
||||
// In either case, walk back the decrementation by FATTN_KQ_STRIDE.
|
||||
KV_max_sj += FATTN_KQ_STRIDE;
|
||||
|
||||
if (threadIdx.x != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -418,7 +418,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
float * const __restrict__ KQ_max,
|
||||
float * const __restrict__ KQ_rowsum,
|
||||
const int kb0) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
typedef fattn_mma_f16_config<DKQ, DV> c;
|
||||
|
||||
#ifdef CP_ASYNC_AVAILABLE
|
||||
@@ -776,7 +776,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
|
||||
GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool mla, bool needs_fixup, bool is_fixup>
|
||||
@@ -785,6 +785,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
||||
const half2 * const __restrict__ K_h2,
|
||||
const half2 * const __restrict__ V_h2,
|
||||
const half2 * const __restrict__ mask_h2,
|
||||
const float * const __restrict__ sinks_f,
|
||||
float2 * const __restrict__ dstk,
|
||||
float2 * const __restrict__ dstk_fixup,
|
||||
const float scale,
|
||||
@@ -800,7 +801,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
||||
const int jt,
|
||||
const int kb0_start,
|
||||
const int kb0_stop) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
typedef fattn_mma_f16_config<DKQ, DV> c;
|
||||
@@ -957,6 +958,52 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
||||
}
|
||||
}
|
||||
|
||||
// If attention sinks are used, potentially re-scale if KQ_max is small.
|
||||
// Also add the sink as a value to KQ_rowsum, this is done after synchonization of KQ_rowsum
|
||||
// so it's being done unconditionally for every thread.
|
||||
if (!is_fixup && (np == 1 || threadIdx.y % np == 0) && sinks_f) {
|
||||
float KQ_max_scale[cols_per_thread];
|
||||
#pragma unroll
|
||||
for (int col = 0; col < cols_per_thread; ++col) {
|
||||
static_assert(ntiles == 1 || ntiles == 2, "ntiles > 2 not implemented");
|
||||
const int jc = ntiles == 1 ? 2*tile_C_VKQ::get_j(col/2) + col % 2 : tile_C_VKQ_16::get_i(col);
|
||||
const float sink = sinks_f[jc % ncols2];
|
||||
|
||||
const float KQ_max_new = fmaxf(KQ_max[col], sink);
|
||||
const float KQ_max_diff = KQ_max[col] - KQ_max_new;
|
||||
KQ_max_scale[col] = expf(KQ_max_diff);
|
||||
KQ_max[col] = KQ_max_new;
|
||||
|
||||
*((uint32_t *) &KQ_max_scale[col]) *= KQ_max_diff >= SOFTMAX_FTZ_THRESHOLD;
|
||||
|
||||
const float KQ_max_add = expf(sink - KQ_max_new);
|
||||
KQ_rowsum[col] = KQ_max_scale[col]*KQ_rowsum[col] + KQ_max_add;
|
||||
}
|
||||
|
||||
if (ntiles == 1) {
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[1]);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV/tile_C_VKQ::I; ++i) {
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C_VKQ::ne; ++l) {
|
||||
VKQ_C[i].x[l] *= KQ_max_scale_h2;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int col = 0; col < cols_per_thread; ++col) {
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[col], KQ_max_scale[col]);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV/tile_C_VKQ_16::J; ++i) {
|
||||
#pragma unroll
|
||||
for (int l0 = 0; l0 < tile_C_VKQ_16::ne; l0 += 2) {
|
||||
VKQ_C_16[i*ntiles/2 + col/2].x[l0 + col % 2] *= KQ_max_scale_h2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Combine VKQ accumulator values if np > 1.
|
||||
// It's also faster to do small writes to shared memory, then large write to VRAM than to do small writes to VRAM.
|
||||
// So also write VKQ accumulators to shared memory in column-major format if np == 1.
|
||||
@@ -1196,7 +1243,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
||||
GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); GGML_UNUSED(stride_mask);
|
||||
GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool mla>
|
||||
@@ -1223,7 +1270,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
if (use_logit_softcap && !(DKQ == 128 || DKQ == 256)) {
|
||||
@@ -1271,18 +1318,21 @@ static __global__ void flash_attn_ext_f16(
|
||||
|
||||
while (kbc < kbc_stop && kb0_stop == iter_k) {
|
||||
const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2));
|
||||
const int head = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j);
|
||||
const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile.
|
||||
const int zt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); // head in units of ncols2
|
||||
const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile.
|
||||
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02*(head*ncols2));
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head*ncols2 / gqa_ratio));
|
||||
const int head0 = zt * ncols2;
|
||||
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
|
||||
const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr :
|
||||
(const half2 *) (mask + nb33*(sequence % ne33) + nb31*jt*ncols1);
|
||||
float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head*ncols2) * (DV/2);
|
||||
float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head0) * (DV/2);
|
||||
|
||||
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head*ncols2 / gqa_ratio));
|
||||
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
|
||||
const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr;
|
||||
|
||||
const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f;
|
||||
const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f;
|
||||
|
||||
const int kb0_start_kernel = kb0_start * kb_niter;
|
||||
int kb0_stop_kernel = kb0_stop * kb_niter;
|
||||
@@ -1295,12 +1345,12 @@ static __global__ void flash_attn_ext_f16(
|
||||
if (kb0_start == 0) {
|
||||
constexpr bool needs_fixup = false; // CUDA block is working on an entire tile.
|
||||
flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup>
|
||||
(Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
|
||||
(Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
|
||||
ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
|
||||
} else {
|
||||
constexpr bool needs_fixup = true; // CUDA block is working on the beginning of a tile.
|
||||
flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup>
|
||||
(Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
|
||||
(Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
|
||||
ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
|
||||
}
|
||||
|
||||
@@ -1316,18 +1366,21 @@ static __global__ void flash_attn_ext_f16(
|
||||
}
|
||||
|
||||
const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2));
|
||||
const int head = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j);
|
||||
const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile.
|
||||
const int zt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); // head in units of ncols2
|
||||
const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile.
|
||||
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02*(head*ncols2));
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head*ncols2 / gqa_ratio));
|
||||
const int head0 = zt * ncols2;
|
||||
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
|
||||
const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr :
|
||||
(const half2 *) (mask + nb33*(sequence % ne33) + nb31*jt*ncols1);
|
||||
float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head*ncols2) * (DV/2);
|
||||
float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head0) * (DV/2);
|
||||
|
||||
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head*ncols2 / gqa_ratio));
|
||||
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
|
||||
const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr;
|
||||
|
||||
const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f;
|
||||
const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f;
|
||||
|
||||
const int kb0_start_kernel = kb0_start * kb_niter;
|
||||
int kb0_stop_kernel = kb0_stop * kb_niter;
|
||||
@@ -1339,7 +1392,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
constexpr bool is_fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
|
||||
constexpr bool needs_fixup = false;
|
||||
flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup>
|
||||
(Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
|
||||
(Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
|
||||
ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
|
||||
#else
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks);
|
||||
@@ -1354,7 +1407,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <int DKQ, int DV, int ncols1, int ncols2>
|
||||
|
||||
@@ -49,10 +49,11 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float * sinksf = (const float *) (sinks);
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
@@ -242,6 +243,31 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
//Attention sink: adjust running max and sum once per head
|
||||
if (sinksf && blockIdx.y == 0) {
|
||||
const half sink = __float2half(sinksf[head]);
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
half kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new_j));
|
||||
kqmax[j0/nwarps] = kqmax_new_j;
|
||||
|
||||
const half val = hexp(sink - kqmax[j0/nwarps]);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum[j0/nwarps].x = __hadd(kqsum[j0/nwarps].x, val);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float2 * dst2 = (float2 *) dst;
|
||||
|
||||
#pragma unroll
|
||||
|
||||
@@ -60,10 +60,11 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float * sinksf = (const float *) (sinks);
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
@@ -252,6 +253,33 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
//Attention sink: adjust running max and sum once per head
|
||||
if (sinksf && blockIdx.y == 0) {
|
||||
const float sink = sinksf[head];
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j);
|
||||
kqmax[j0/nwarps] = kqmax_new_j;
|
||||
|
||||
const float val = expf(sink - kqmax[j0/nwarps]);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum[j0/nwarps] += val;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float2 * dst2 = (float2 *) dst;
|
||||
|
||||
#pragma unroll
|
||||
|
||||
@@ -15,7 +15,6 @@ namespace wmma = mtmusa::wmma;
|
||||
namespace wmma = nvcuda::wmma;
|
||||
#endif // GGML_USE_MUSA
|
||||
#elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
|
||||
#undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
|
||||
#include <rocwmma/rocwmma.hpp>
|
||||
namespace wmma = rocwmma;
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
@@ -82,11 +81,12 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float * Q_f = (const float *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half * K_h = (const half *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half * V_h = (const half *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const half2 * mask2 = (const half2 *) maskh;
|
||||
const float * Q_f = (const float *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half * K_h = (const half *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half * V_h = (const half *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const half2 * mask2 = (const half2 *) maskh;
|
||||
const float * sinksf = (const float *) sinks;
|
||||
|
||||
const int stride_Q = nb01 / sizeof(float);
|
||||
const int stride_KV = nb11 / sizeof(half);
|
||||
@@ -381,6 +381,53 @@ static __global__ void flash_attn_ext_f16(
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Apply attention sinks
|
||||
if (sinksf && blockIdx.y == 0) {
|
||||
const float sinkf = sinksf[head];
|
||||
const half sinkh = __float2half(sinkf);
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
if (std::is_same<KQ_acc_t, float>::value) {
|
||||
float kqmax_new = fmaxf(KQ_max_f[j0/nwarps], sinkf);
|
||||
|
||||
const float KQ_max_scale = expf(KQ_max_f[j0/nwarps] - kqmax_new);
|
||||
KQ_max_f[j0/nwarps] = kqmax_new;
|
||||
|
||||
KQ_rowsum_f[j0/nwarps] = KQ_rowsum_f[j0/nwarps] * KQ_max_scale + expf(sinkf - KQ_max_f[j0/nwarps]);
|
||||
|
||||
const half2 scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
if (i0 + warp_size > D/2 && i >= D/2) break;
|
||||
VKQ2[j*(D_padded/2) + i] *= scale_h2;
|
||||
}
|
||||
} else {
|
||||
half kqmax_old = __low2half(KQ_max_h2[j0/nwarps]);
|
||||
half kqmax_new = fmaxf(kqmax_old, sinkh);
|
||||
KQ_max_h2[j0/nwarps] = __half2half2(kqmax_new);
|
||||
|
||||
const half KQ_max_scale_h = hexp(kqmax_old - kqmax_new);
|
||||
const half2 KQ_max_scale = __half2half2(KQ_max_scale_h);
|
||||
|
||||
KQ_rowsum_h2[j0/nwarps] = KQ_rowsum_h2[j0/nwarps] * KQ_max_scale;
|
||||
const half val = hexp(sinkh - kqmax_new);
|
||||
KQ_rowsum_h2[j0/nwarps].x = __hadd(KQ_rowsum_h2[j0/nwarps].x, val);
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
if (i0 + warp_size > D/2 && i >= D/2) break;
|
||||
VKQ2[j*(D_padded/2) + i] *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j_VKQ = j0 + threadIdx.y;
|
||||
|
||||
@@ -274,23 +274,12 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
const ggml_tensor * sinks = dst->src[4];
|
||||
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
|
||||
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
|
||||
|
||||
// TODO: currently only vec implementation for sinks is supported [TAG_ATTN_SINKS]
|
||||
if (sinks) {
|
||||
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
} else {
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
if (GGML_CUDA_CC_IS_AMD(cc) && fp16_mma_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
|
||||
@@ -327,7 +316,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
|
||||
const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
|
||||
const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (Q->ne[2] > 4*K->ne[2] && K->ne[1] >= 8192);
|
||||
const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && !mma_needs_data_conversion &&
|
||||
const bool mma_faster_for_bs1 = turing_mma_available(cc) && gqa_opt_applies && !mma_needs_data_conversion &&
|
||||
(cc < GGML_CUDA_CC_ADA_LOVELACE || mma_faster_for_rtx4000);
|
||||
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
|
||||
if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
|
||||
@@ -340,7 +329,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
}
|
||||
|
||||
// The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
|
||||
if (fp16_mma_available(cc) && !new_mma_available(cc)) {
|
||||
if (fp16_mma_available(cc) && !turing_mma_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "getrows.cuh"
|
||||
#include "dequantize.cuh"
|
||||
#include "convert.cuh"
|
||||
|
||||
template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static __global__ void k_get_rows(
|
||||
@@ -34,8 +35,8 @@ static __global__ void k_get_rows(
|
||||
dfloat2 v;
|
||||
dequantize_kernel(src0_row, ib, iqs, v);
|
||||
|
||||
dst_row[iybs + iqs + 0] = float(v.x);
|
||||
dst_row[iybs + iqs + y_offset] = float(v.y);
|
||||
dst_row[iybs + iqs + 0] = ggml_cuda_cast<dst_t>(v.x);
|
||||
dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dst_t>(v.y);
|
||||
}
|
||||
|
||||
template<typename src0_t, typename dst_t>
|
||||
@@ -62,7 +63,7 @@ static __global__ void k_get_rows_float(
|
||||
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
||||
const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
|
||||
|
||||
dst_row[i00] = float(src0_row[i00]);
|
||||
dst_row[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
|
||||
}
|
||||
|
||||
template<typename grad_t, typename dst_t>
|
||||
|
||||
@@ -22,11 +22,13 @@
|
||||
#include "ggml-cuda/fattn.cuh"
|
||||
#include "ggml-cuda/getrows.cuh"
|
||||
#include "ggml-cuda/im2col.cuh"
|
||||
#include "ggml-cuda/mmf.cuh"
|
||||
#include "ggml-cuda/mmq.cuh"
|
||||
#include "ggml-cuda/mmv.cuh"
|
||||
#include "ggml-cuda/mmvf.cuh"
|
||||
#include "ggml-cuda/mmvq.cuh"
|
||||
#include "ggml-cuda/norm.cuh"
|
||||
#include "ggml-cuda/opt-step-adamw.cuh"
|
||||
#include "ggml-cuda/opt-step-sgd.cuh"
|
||||
#include "ggml-cuda/out-prod.cuh"
|
||||
#include "ggml-cuda/pad.cuh"
|
||||
#include "ggml-cuda/pool2d.cuh"
|
||||
@@ -179,30 +181,6 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#if defined(GGML_USE_HIP)
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
|
||||
{
|
||||
int major_version = 0;
|
||||
size_t version_length = 0;
|
||||
if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
|
||||
std::vector<char> version(version_length+1, '\0');
|
||||
if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
|
||||
version.resize(::strlen(version.data()));
|
||||
int parsed_value = 0;
|
||||
if (std::from_chars(version.data(), version.data() + version.size(), parsed_value).ec == std::errc()) {
|
||||
major_version = parsed_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (major_version < 4) {
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
|
||||
rocblas_initialize();
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
ggml_cuda_device_info info = {};
|
||||
|
||||
cudaError_t err = cudaGetDeviceCount(&info.device_count);
|
||||
@@ -2008,7 +1986,9 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
|
||||
&& ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
|
||||
|
||||
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
|
||||
bool use_mul_mat_vec_f = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||
bool use_mul_mat_f = !ggml_is_quantized(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
@@ -2028,14 +2008,18 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
}
|
||||
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const int warp_size = ggml_cuda_info().devices[id].warp_size;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
}
|
||||
} else {
|
||||
const int cc = ggml_cuda_info().devices[ctx.device].cc;
|
||||
const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
}
|
||||
|
||||
@@ -2048,15 +2032,17 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||
|
||||
//TODO update for generic tensor parallelism
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
bool use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
|
||||
bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
|
||||
bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
|
||||
|
||||
if (!split && use_mul_mat_vec) {
|
||||
if (!split && use_mul_mat_vec_f) {
|
||||
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
|
||||
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
|
||||
ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
|
||||
ggml_cuda_mul_mat_vec_f(ctx, src0, src1, nullptr, dst);
|
||||
} else if (!split && use_mul_mat_f) {
|
||||
ggml_cuda_mul_mat_f(ctx, src0, src1, nullptr, dst);
|
||||
} else if (!split && use_mul_mat_vec_q) {
|
||||
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
|
||||
} else if (!split && use_mul_mat_q) {
|
||||
@@ -2065,8 +2051,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||
// general KQ + KQV multi-batch without FlashAttention
|
||||
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
||||
} else if (use_mul_mat_vec) {
|
||||
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
|
||||
} else if (use_mul_mat_vec_f) {
|
||||
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_f, nullptr);
|
||||
} else if (use_mul_mat_vec_q) {
|
||||
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
|
||||
} else if (use_mul_mat_q) {
|
||||
@@ -2094,7 +2080,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
if (ggml_is_quantized(src0->type)) {
|
||||
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
|
||||
} else {
|
||||
ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
|
||||
ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -2494,6 +2480,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
ggml_cuda_opt_step_adamw(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
ggml_cuda_opt_step_sgd(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -3516,14 +3505,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
|
||||
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
|
||||
if (!new_mma_available(cc)) {
|
||||
if (!turing_mma_available(cc)) {
|
||||
return false;
|
||||
}
|
||||
const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
|
||||
return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0;
|
||||
}
|
||||
// TODO: more general-purpose attention sink support [TAG_ATTN_SINKS]
|
||||
if (op->src[4] && op->src[0]->ne[0] != 64 && op->src[0]->ne[0] != 128) { // currently only sinks for head_size 64 and 128 are supported
|
||||
if (op->src[4] && !fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc)
|
||||
&& op->src[0]->ne[0] != 64 && op->src[0]->ne[0] != 128) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 192) {
|
||||
@@ -3550,6 +3540,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -3789,10 +3780,10 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||
}
|
||||
|
||||
ggml_backend_t cuda_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_cuda_guid(),
|
||||
/* .interface = */ ggml_backend_cuda_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
|
||||
/* .context = */ ctx,
|
||||
/* .guid = */ ggml_backend_cuda_guid(),
|
||||
/* .iface = */ ggml_backend_cuda_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
return cuda_backend;
|
||||
|
||||
@@ -1,4 +1,14 @@
|
||||
#include "mean.cuh"
|
||||
#include "reduce_rows.cuh"
|
||||
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
#include <cub/cub.cuh>
|
||||
using namespace cub;
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
template <typename T> __global__ void divide_by_count(T * result, size_t count) {
|
||||
*result /= static_cast<T>(count);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
@@ -13,7 +23,51 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const int64_t ncols = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
// Special case for reducing vectors
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
cudaStreamCaptureStatus iscapturing;
|
||||
CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
|
||||
#endif // USE_CUDA_GRAPH
|
||||
if ((nrows == 1) &&
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
// CUDA_GRAPHS_DISABLED
|
||||
((ncols > 65536) &&
|
||||
((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
||||
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
|
||||
ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
|
||||
// CUDA_GRAPHS ENABLED
|
||||
((ncols > 32768) &&
|
||||
!((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
||||
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
|
||||
ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
|
||||
#else
|
||||
(ncols > 65536)) {
|
||||
#endif // USE_CUDA_GRAPH
|
||||
// Single row - use device-wide reduction
|
||||
size_t tmp_size = 0;
|
||||
ggml_cuda_pool & pool = ctx.pool();
|
||||
|
||||
DeviceReduce::Sum(nullptr, tmp_size, src0_d, dst_d, ncols, stream);
|
||||
|
||||
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
|
||||
DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, src0_d, dst_d, ncols, stream);
|
||||
|
||||
// Divide by ncols
|
||||
divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
|
||||
return;
|
||||
}
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||
if ((nrows / nsm) < 2) {
|
||||
const dim3 block_dims(512, 1, 1);
|
||||
reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
} else {
|
||||
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
|
||||
reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
}
|
||||
}
|
||||
|
||||
+88
-22
@@ -23,13 +23,13 @@
|
||||
static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
|
||||
int ret = 0;
|
||||
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
asm("movmatrix.sync.aligned.m8n8.trans.b16 %0, %1;"
|
||||
: "=r"(ret) : "r"(x));
|
||||
#else
|
||||
GGML_UNUSED(x);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(TURING_MMA_AVAILABLE)
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -167,6 +167,38 @@ namespace ggml_cuda_mma {
|
||||
}
|
||||
};
|
||||
|
||||
template <int I_, int J_>
|
||||
struct tile<I_, J_, nv_bfloat162> {
|
||||
static constexpr int I = I_;
|
||||
static constexpr int J = J_;
|
||||
static constexpr int ne = I * J / WARP_SIZE;
|
||||
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
if constexpr (I == 8 && J == 8) {
|
||||
return threadIdx.x / 4;
|
||||
} else if constexpr (I == 16 && J == 4) {
|
||||
return l * 8 + threadIdx.x / 4;
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
return (l % 2) * 8 + threadIdx.x / 4;
|
||||
} else {
|
||||
static_assert(I == -1 && J == -1, "template specialization not implemented");
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 8 && J == 8) {
|
||||
return l * 4 + threadIdx.x % 4;
|
||||
} else if constexpr (I == 16 && J == 4) {
|
||||
return threadIdx.x % 4;
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
return (l / 2) * 4 + threadIdx.x % 4;
|
||||
} else {
|
||||
static_assert(I == -1 && J == -1, "template specialization not implemented");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <int I, int J>
|
||||
static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
|
||||
tile<I, J/2, half2> ret;
|
||||
@@ -209,7 +241,7 @@ namespace ggml_cuda_mma {
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ void load_ldmatrix(
|
||||
tile<8, 8, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
int * xi = (int *) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + ((threadIdx.x / t.I) * (t.J / 2)) % t.J;
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
|
||||
@@ -217,13 +249,13 @@ namespace ggml_cuda_mma {
|
||||
: "l"(xs));
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ void load_ldmatrix(
|
||||
tile<16, 4, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
int * xi = (int *) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride;
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
|
||||
@@ -232,13 +264,13 @@ namespace ggml_cuda_mma {
|
||||
#else
|
||||
load_generic(xs0, stride);
|
||||
GGML_UNUSED(t);
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ void load_ldmatrix(
|
||||
tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#if defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(TURING_MMA_AVAILABLE)
|
||||
int * xi = (int * ) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
|
||||
@@ -246,13 +278,13 @@ namespace ggml_cuda_mma {
|
||||
: "l"(xs));
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ void load_ldmatrix_trans(
|
||||
tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
int * xi = (int * ) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.b16 {%0, %1, %2, %3}, [%4];"
|
||||
@@ -263,12 +295,12 @@ namespace ggml_cuda_mma {
|
||||
GGML_UNUSED(xs0);
|
||||
GGML_UNUSED(stride);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 8, int> & D, const tile<16, 4, int> & A, const tile<8, 4, int> & B) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
|
||||
: "+r"(D.x[0]), "+r"(D.x[1]), "+r"(D.x[2]), "+r"(D.x[3])
|
||||
@@ -287,12 +319,12 @@ namespace ggml_cuda_mma {
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 8, int> & D, const tile<16, 8, int> & A, const tile<8, 8, int> & B) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
|
||||
: "+r"(D.x[0]), "+r"(D.x[1]), "+r"(D.x[2]), "+r"(D.x[3])
|
||||
@@ -317,12 +349,12 @@ namespace ggml_cuda_mma {
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 4, half2> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
int * Dxi = (int *) D.x;
|
||||
@@ -344,12 +376,12 @@ namespace ggml_cuda_mma {
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 8, half2> & D, const tile<16, 8, half2> & A, const tile<16, 8, half2> & B) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
int * Dxi = (int *) D.x;
|
||||
@@ -380,12 +412,29 @@ namespace ggml_cuda_mma {
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 8, float> & D, const tile<16, 8, float> & A, const tile<8, 8, float> & B) {
|
||||
#ifdef AMPERE_MMA_AVAILABLE
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
int * Dxi = (int *) D.x;
|
||||
asm("mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
|
||||
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3])
|
||||
: "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]));
|
||||
#else
|
||||
GGML_UNUSED(D);
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // AMPERE_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 8, float> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
int * Dxi = (int *) D.x;
|
||||
@@ -407,12 +456,29 @@ namespace ggml_cuda_mma {
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 8, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<8, 8, nv_bfloat162> & B) {
|
||||
#ifdef AMPERE_MMA_AVAILABLE
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
int * Dxi = (int *) D.x;
|
||||
asm("mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
|
||||
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3])
|
||||
: "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]));
|
||||
#else
|
||||
GGML_UNUSED(D);
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // AMPERE_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, float> & D, const tile<16, 8, half2> & A, const tile<16, 8, half2> & B) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
int * Dxi = (int *) D.x;
|
||||
@@ -443,7 +509,7 @@ namespace ggml_cuda_mma {
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
|
||||
@@ -0,0 +1,431 @@
|
||||
#include "ggml.h"
|
||||
#include "common.cuh"
|
||||
#include "mma.cuh"
|
||||
#include "mmf.cuh"
|
||||
|
||||
using namespace ggml_cuda_mma;
|
||||
|
||||
#define MMF_ROWS_PER_BLOCK 32
|
||||
|
||||
template <typename T, int rows_per_block, int cols_per_block, int nwarps>
|
||||
__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
|
||||
static __global__ void mul_mat_f(
|
||||
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
|
||||
const int ncols, const int nchannels_y, const int stride_row, const int stride_col_y, const int stride_col_dst,
|
||||
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
typedef tile<16, 8, T> tile_A;
|
||||
typedef tile< 8, 8, T> tile_B;
|
||||
typedef tile<16, 8, float> tile_C;
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
constexpr int tile_k_padded = warp_size + 4;
|
||||
constexpr int ntA = rows_per_block / tile_A::I;
|
||||
constexpr int ntB = (cols_per_block + tile_B::I - 1) / tile_B::I;
|
||||
|
||||
const int row0 = blockIdx.x * rows_per_block;
|
||||
const int channel_dst = blockIdx.y;
|
||||
const int channel_x = channel_dst / channel_ratio;
|
||||
const int channel_y = channel_dst;
|
||||
const int sample_dst = blockIdx.z;
|
||||
const int sample_x = sample_dst / sample_ratio;
|
||||
const int sample_y = sample_dst;
|
||||
|
||||
x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row0*stride_row ;
|
||||
y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y;
|
||||
dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst;
|
||||
|
||||
const float2 * y2 = (const float2 *) y;
|
||||
|
||||
extern __shared__ char data_mmv[];
|
||||
|
||||
tile_C C[ntA][ntB];
|
||||
|
||||
T * tile_xy = (T *) data_mmv + threadIdx.y*(tile_A::I * tile_k_padded);
|
||||
|
||||
for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) {
|
||||
tile_A A[ntA][warp_size / tile_A::J];
|
||||
#pragma unroll
|
||||
for (int itA = 0; itA < ntA; ++itA) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < tile_A::I; ++i) {
|
||||
tile_xy[i*tile_k_padded + threadIdx.x] = x[(itA*tile_A::I + i)*stride_row + col];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) {
|
||||
load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int itB = 0; itB < ntB; ++itB) {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < tile_B::I; ++j0) {
|
||||
const int j = j0 + itB*tile_B::I;
|
||||
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f;
|
||||
}
|
||||
} else if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < tile_B::I; ++j0) {
|
||||
const int j = j0 + itB*tile_B::I;
|
||||
|
||||
const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
|
||||
}
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) {
|
||||
tile_B B;
|
||||
load_ldmatrix(B, tile_xy + k0, tile_k_padded);
|
||||
#pragma unroll
|
||||
for (int itA = 0; itA < ntA; ++itA) {
|
||||
mma(C[itA][itB], A[itA][k0/tile_B::J], B);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float * buf_iw = (float *) data_mmv;
|
||||
constexpr int kiw = nwarps*rows_per_block + 4;
|
||||
|
||||
if (nwarps > 1) {
|
||||
__syncthreads();
|
||||
}
|
||||
#pragma unroll
|
||||
for (int itB = 0; itB < ntB; ++itB) {
|
||||
#pragma unroll
|
||||
for (int itA = 0; itA < ntA; ++itA) {
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
const int i = threadIdx.y*rows_per_block + itA*tile_C::I + tile_C::get_i(l);
|
||||
const int j = itB*tile_C::J + tile_C::get_j(l);
|
||||
buf_iw[j*kiw + i] = C[itA][itB].x[l];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nwarps > 1) {
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
if (j0 + nwarps > cols_per_block && j >= cols_per_block) {
|
||||
return;
|
||||
}
|
||||
|
||||
float sum = 0.0f;
|
||||
static_assert(rows_per_block == warp_size, "need loop/check");
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < nwarps*rows_per_block; i0 += rows_per_block) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
sum += buf_iw[j*kiw + i];
|
||||
}
|
||||
dst[j*stride_col_dst + row0 + threadIdx.x] = sum;
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(ids); GGML_UNUSED(dst);
|
||||
GGML_UNUSED(ncols); GGML_UNUSED(nchannels_y); GGML_UNUSED(stride_row); GGML_UNUSED(stride_col_y); GGML_UNUSED(stride_col_dst);
|
||||
GGML_UNUSED(channel_ratio); GGML_UNUSED(stride_channel_x); GGML_UNUSED(stride_channel_y); GGML_UNUSED(stride_channel_dst);
|
||||
GGML_UNUSED(sample_ratio); GGML_UNUSED(stride_sample_x); GGML_UNUSED(stride_sample_y); GGML_UNUSED(stride_sample_dst);
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
}
|
||||
|
||||
template <typename T, int cols_per_block>
|
||||
static void mul_mat_f_cuda(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t nrows_x,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
typedef tile<16, 8, T> tile_A;
|
||||
typedef tile< 8, 8, T> tile_B;
|
||||
typedef tile<16, 8, float> tile_C;
|
||||
|
||||
GGML_ASSERT(!ids && "mul_mat_id not implemented");
|
||||
|
||||
GGML_ASSERT(ncols_x % 2 == 0);
|
||||
GGML_ASSERT(stride_row % 2 == 0);
|
||||
GGML_ASSERT(stride_col_y % 2 == 0);
|
||||
GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
|
||||
GGML_ASSERT( nsamples_dst % nsamples_x == 0);
|
||||
const int64_t channel_ratio = nchannels_dst / nchannels_x;
|
||||
const int64_t sample_ratio = nsamples_dst / nsamples_x;
|
||||
|
||||
const int device = ggml_cuda_get_device();
|
||||
const int warp_size = ggml_cuda_info().devices[device].warp_size;
|
||||
|
||||
int64_t nwarps_best = 1;
|
||||
int64_t niter_best = (ncols_x + warp_size*2 - 1) / (warp_size*2);
|
||||
int64_t max_block_size = 256;
|
||||
for (int64_t nwarps = 2; nwarps <= max_block_size/warp_size; nwarps++) {
|
||||
const int64_t niter = (ncols_x + nwarps*warp_size*2 - 1) / (nwarps*warp_size*2);
|
||||
if (niter < niter_best) {
|
||||
niter_best = niter;
|
||||
nwarps_best = nwarps;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
|
||||
const int nbytes_shared_iter = nwarps_best * tile_A::I * (warp_size + 4) * 4;
|
||||
const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4;
|
||||
const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
|
||||
const dim3 block_nums(nrows_x/rows_per_block, nchannels_dst, nsamples_dst);
|
||||
const dim3 block_dims(warp_size, nwarps_best, 1);
|
||||
switch (nwarps_best) {
|
||||
case 1: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 1><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 2: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 2><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 3: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 3><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 4: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 4><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 5: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 5><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 6: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 6><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 7: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 7><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 8: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 8><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void mul_mat_f_switch_cols_per_block(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
switch (ncols_dst) {
|
||||
case 1: {
|
||||
mul_mat_f_cuda<T, 1>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 2: {
|
||||
mul_mat_f_cuda<T, 2>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 3: {
|
||||
mul_mat_f_cuda<T, 3>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 4: {
|
||||
mul_mat_f_cuda<T, 4>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 5: {
|
||||
mul_mat_f_cuda<T, 5>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 6: {
|
||||
mul_mat_f_cuda<T, 6>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 7: {
|
||||
mul_mat_f_cuda<T, 7>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 8: {
|
||||
mul_mat_f_cuda<T, 8>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 9: {
|
||||
mul_mat_f_cuda<T, 9>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 10: {
|
||||
mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 11: {
|
||||
mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 12: {
|
||||
mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 13: {
|
||||
mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 14: {
|
||||
mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 15: {
|
||||
mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 16: {
|
||||
mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
||||
GGML_ASSERT( src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
const size_t ts_src0 = ggml_type_size(src0->type);
|
||||
const size_t ts_src1 = ggml_type_size(src1->type);
|
||||
const size_t ts_dst = ggml_type_size(dst->type);
|
||||
|
||||
GGML_ASSERT(ne13 == ne3);
|
||||
|
||||
GGML_ASSERT( nb00 == ts_src0);
|
||||
GGML_ASSERT( nb10 == ts_src1);
|
||||
GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
|
||||
GGML_ASSERT( nb0 == ts_dst);
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
|
||||
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const int64_t s01 = src0->nb[1] / ts_src0;
|
||||
const int64_t s11 = src1->nb[1] / ts_src1;
|
||||
const int64_t s1 = dst->nb[1] / ts_dst;
|
||||
const int64_t s02 = src0->nb[2] / ts_src0;
|
||||
const int64_t s12 = src1->nb[2] / ts_src1;
|
||||
const int64_t s2 = dst->nb[2] / ts_dst;
|
||||
const int64_t s03 = src0->nb[3] / ts_src0;
|
||||
const int64_t s13 = src1->nb[3] / ts_src1;
|
||||
const int64_t s3 = dst->nb[3] / ts_dst;
|
||||
|
||||
// For MUL_MAT_ID the memory layout is different than for MUL_MAT:
|
||||
const int64_t ncols_dst = ids ? ne2 : ne1;
|
||||
const int64_t nchannels_y = ids ? ne11 : ne12;
|
||||
const int64_t nchannels_dst = ids ? ne1 : ne2;
|
||||
const int64_t stride_channel_dst = ids ? s1 : s2;
|
||||
const int64_t stride_channel_y = ids ? s11 : s12;
|
||||
|
||||
GGML_ASSERT(!ids || ncols_dst == 1);
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
constexpr int vals_per_T = 1;
|
||||
mul_mat_f_switch_cols_per_block(
|
||||
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1,
|
||||
ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
|
||||
} break;
|
||||
case GGML_TYPE_F16: {
|
||||
const half2 * src0_d = (const half2 *) src0->data;
|
||||
constexpr int vals_per_T = 2;
|
||||
mul_mat_f_switch_cols_per_block(
|
||||
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1,
|
||||
ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
|
||||
} break;
|
||||
case GGML_TYPE_BF16: {
|
||||
const nv_bfloat162 * src0_d = (const nv_bfloat162 *) src0->data;
|
||||
constexpr int vals_per_T = 2;
|
||||
mul_mat_f_switch_cols_per_block(
|
||||
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1,
|
||||
ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, int64_t ne11) {
|
||||
if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) {
|
||||
return false;
|
||||
}
|
||||
if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
|
||||
return false;
|
||||
}
|
||||
if (ne11 > 16) {
|
||||
return false;
|
||||
}
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return ampere_mma_available(cc);
|
||||
case GGML_TYPE_F16:
|
||||
return turing_mma_available(cc);
|
||||
case GGML_TYPE_BF16:
|
||||
return ampere_mma_available(cc);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
|
||||
|
||||
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, int64_t ne11);
|
||||
@@ -310,7 +310,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (new_mma_available(cc)) {
|
||||
if (turing_mma_available(cc)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
+132
-132
@@ -92,7 +92,7 @@ struct tile_x_sizes {
|
||||
};
|
||||
|
||||
static int get_mmq_x_max_host(const int cc) {
|
||||
return (amd_mfma_available(cc) || new_mma_available(cc)) ? 128 :
|
||||
return (amd_mfma_available(cc) || turing_mma_available(cc)) ? 128 :
|
||||
GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
128 : 64;
|
||||
@@ -102,9 +102,9 @@ static int get_mmq_x_max_host(const int cc) {
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_mmq_x_max_device() {
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
return 128;
|
||||
#else // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#else // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
#if defined(GGML_USE_HIP)
|
||||
return 64;
|
||||
@@ -121,7 +121,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
static int get_mmq_y_host(const int cc) {
|
||||
@@ -233,7 +233,7 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
|
||||
static int mmq_get_granularity_host(const int mmq_x, const int cc) {
|
||||
if (amd_mfma_available(cc)) {
|
||||
return mmq_x >= 128 ? 32 : 16;
|
||||
} else if (new_mma_available(cc) && mmq_x >= 48) {
|
||||
} else if (turing_mma_available(cc) && mmq_x >= 48) {
|
||||
return 16;
|
||||
} else {
|
||||
return 8;
|
||||
@@ -244,7 +244,7 @@ static int mmq_get_granularity_host(const int mmq_x, const int cc) {
|
||||
static constexpr __device__ int mmq_get_granularity_device(const int mmq_x) {
|
||||
return mmq_x >= 128 ? 32 : 16;
|
||||
}
|
||||
#elif defined(NEW_MMA_AVAILABLE)
|
||||
#elif defined(TURING_MMA_AVAILABLE)
|
||||
static constexpr __device__ int mmq_get_granularity_device(const int mmq_x) {
|
||||
return mmq_x >= 48 ? 16 : 8;
|
||||
}
|
||||
@@ -279,14 +279,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + 2*MMQ_TILE_NE_K);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_0);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -305,12 +305,12 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbx;
|
||||
const int qs0 = get_int_b2(bxi->qs, kqsx);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI4_0) + kqsx + 0] = __vsubss4((qs0 >> 0) & 0x0F0F0F0F, 0x08080808);
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI4_0) + kqsx + QI4_0] = __vsubss4((qs0 >> 4) & 0x0F0F0F0F, 0x08080808);
|
||||
#else
|
||||
x_qs[i*(MMQ_TILE_NE_K + 1) + txi] = qs0;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_0;
|
||||
@@ -327,11 +327,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbxd;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = bxi->d;
|
||||
#else
|
||||
x_df[i*(MMQ_TILE_NE_K/QI4_0) + i/QI4_0 + kbxd] = bxi->d;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -382,14 +382,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
half2 * x_dm = (half2 *) (x_qs + 2*MMQ_TILE_NE_K);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
half2 * x_dm = (half2 *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_1);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -408,12 +408,12 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbx;
|
||||
const int qs0 = get_int_b4(bxi->qs, kqsx);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI4_1) + kqsx + 0] = (qs0 >> 0) & 0x0F0F0F0F;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI4_1) + kqsx + QI4_1] = (qs0 >> 4) & 0x0F0F0F0F;
|
||||
#else
|
||||
x_qs[i*(MMQ_TILE_NE_K + 1) + txi] = qs0;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_1;
|
||||
@@ -430,11 +430,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbxd;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + kbxd] = bxi->dm;
|
||||
#else
|
||||
x_dm[i*(MMQ_TILE_NE_K/QI4_1) + i/QI4_1 + kbxd] = bxi->dm;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -485,14 +485,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_0, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR5_0);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -527,13 +527,13 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
|
||||
qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI5_0) + kqsx + 0] = qs0;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI5_0) + kqsx + QI5_0] = qs1;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + kbx*(2*QI5_0) + kqsx + 0] = qs0;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + kbx*(2*QI5_0) + kqsx + QI5_0] = qs1;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI5_0;
|
||||
@@ -550,11 +550,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const block_q5_0 * bxi = (const block_q5_0 *) x + kbx0 + i*stride + kbxd;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = bxi->d;
|
||||
#else
|
||||
x_df[i*(MMQ_TILE_NE_K/QI5_0) + i/QI5_0 + kbxd] = bxi->d;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -563,14 +563,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
half2 * x_dm = (half2 *) (x_qs + 2*MMQ_TILE_NE_K);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
half2 * x_dm = (half2 *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR5_1);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -603,13 +603,13 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
|
||||
qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI5_1) + kqsx + 0] = qs0;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI5_1) + kqsx + QI5_1] = qs1;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + kbx*(2*QI5_1) + kqsx + 0] = qs0;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + kbx*(2*QI5_1) + kqsx + QI5_1] = qs1;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI5_1;
|
||||
@@ -626,11 +626,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const block_q5_1 * bxi = (const block_q5_1 *) x + kbx0 + i*stride + kbxd;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + kbxd] = bxi->dm;
|
||||
#else
|
||||
x_dm[i*(MMQ_TILE_NE_K/QI5_1) + i/QI5_1 + kbxd] = bxi->dm;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -639,14 +639,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_tile + 2*MMQ_TILE_NE_K);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
// MMQ_ITER_K / (4 * QR8_0) == 64 required. but NV has only 32 threads per warp
|
||||
constexpr int threads_per_row = 32;
|
||||
@@ -665,13 +665,13 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const block_q8_0 * bxi = (const block_q8_0 *) x + kbx0 + i*stride + kbx;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 0 + txi] = get_int_b2(bxi[0].qs, kqsx);
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + MMQ_TILE_NE_K + txi] = get_int_b2(bxi[MMQ_TILE_NE_K/QI8_0].qs, kqsx);
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 0 + txi] = get_int_b2(bxi[0].qs, kqsx);
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + MMQ_TILE_NE_K + txi] = get_int_b2(bxi[MMQ_TILE_NE_K/QI8_0].qs, kqsx);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
constexpr int blocks_per_tile_x_row = 2*MMQ_TILE_NE_K / QI8_0;
|
||||
@@ -688,11 +688,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const block_q8_0 * bxi = (const block_q8_0 *) x + kbx0 + i*stride + kbxd;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = bxi->d;
|
||||
#else
|
||||
x_df[i*(2*MMQ_TILE_NE_K/QI8_0) + i/(QI8_0/2) + kbxd] = bxi->d;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -701,14 +701,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_MXFP4, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR_MXFP4);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -730,13 +730,13 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const int2 v = get_int_from_table_16(aux_q4, kvalues_mxfp4);
|
||||
const int k0 = kbx * (2 * QI_MXFP4) + kqsx;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + k0 + 0] = v.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + k0 + QI_MXFP4] = v.y;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 0] = v.x;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + QI_MXFP4] = v.y;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI_MXFP4;
|
||||
@@ -753,11 +753,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const block_mxfp4 * bxi = (const block_mxfp4 *) x + kbx0 + i*stride + kbxd;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_1 + kbxd] = ggml_cuda_e8m0_to_fp32(bxi->e)*0.5f;
|
||||
#else
|
||||
x_df[i*(MMQ_TILE_NE_K/QI_MXFP4) + i/QI_MXFP4 + kbxd] = ggml_cuda_e8m0_to_fp32(bxi->e)*0.5f;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1178,7 +1178,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(NEW_MMA_AVAILABLE)
|
||||
#elif defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
typedef tile<16, 4, int> tile_A;
|
||||
typedef tile<16, 8, int> tile_A_8;
|
||||
@@ -1264,14 +1264,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
half2 * x_dm = (half2 *) (x_qs + 2*MMQ_TILE_NE_K);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
half2 * x_dm = (half2 *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR2_K);
|
||||
constexpr int nrows = ggml_cuda_get_physical_warp_size() / threads_per_row;
|
||||
@@ -1295,11 +1295,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const int x_qs_k = (x_ql_0 >> (2*l)) & 0x03030303;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q2_K + k] = x_qs_k;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + k] = x_qs_k;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
const int sc_m = bxi->scales[kqsx];
|
||||
@@ -1310,11 +1310,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const half2 x_dm_ik = make_half2(bxi_dmf.x*(sc_m & 0x0F), bxi_dmf.y*(sc_m >> 4));
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + kqsx] = x_dm_ik;
|
||||
#else
|
||||
x_dm[i*(MMQ_TILE_NE_K + 1) + kqsx] = x_dm_ik;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1452,7 +1452,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(NEW_MMA_AVAILABLE)
|
||||
#elif defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
typedef tile<16, 4, int> tile_A;
|
||||
typedef tile<16, 8, int> tile_A_8;
|
||||
@@ -1582,7 +1582,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
@@ -1590,7 +1590,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
int * x_sc = (int *) (x_df + txs.dm);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR3_K);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -1618,11 +1618,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const int x_qs_k = __vsubss4(x_ql_k | x_qh_k, 0x04040404);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + k] = x_qs_k;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + k] = x_qs_k;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1649,7 +1649,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
const int8_t * sc8 = (const int8_t *) ≻
|
||||
const float d = bxi->d;
|
||||
|
||||
@@ -1659,10 +1659,10 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
}
|
||||
#else
|
||||
x_sc[i*(MMQ_TILE_NE_K/8) + i/8 + ksc] = sc;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
#if !(defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE))
|
||||
#if !(defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE))
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) {
|
||||
int i = (i0 + threadIdx.y*warp_size + threadIdx.x) % mmq_y;
|
||||
@@ -1675,7 +1675,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
x_df[i] = bxi->d;
|
||||
}
|
||||
#endif // !(defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE))
|
||||
#endif // !(defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE))
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y>
|
||||
@@ -1728,7 +1728,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
half2 * x_dm = (half2 *) (x_qs + 2*MMQ_TILE_NE_K);
|
||||
#else
|
||||
@@ -1736,7 +1736,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
int * x_qs = (int *) x_tile;
|
||||
half2 * x_dm = (half2 *) (x_qs + txs.qs);
|
||||
int * x_sc = (int *) (x_dm + txs.dm);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_K);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -1753,15 +1753,15 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
|
||||
const int qs0 = get_int_b4(bxi->qs, txi);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(txi/8) + txi % 8 + 0] = (qs0 >> 0) & 0x0F0F0F0F;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(txi/8) + txi % 8 + 8] = (qs0 >> 4) & 0x0F0F0F0F;
|
||||
#else
|
||||
x_qs[i*(MMQ_TILE_NE_K + 1) + txi] = qs0;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
constexpr int rows_per_warp = warp_size / 2;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) {
|
||||
@@ -1829,7 +1829,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
x_sc[i*(MMQ_TILE_NE_K/8) + i/8 + ksc] = scales8;
|
||||
}
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y>
|
||||
@@ -1872,7 +1872,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
half2 * x_dm = (half2 *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
@@ -1880,7 +1880,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
int * x_qs = (int *) x_tile;
|
||||
half2 * x_dm = (half2 *) (x_qs + txs.qs);
|
||||
int * x_sc = (int *) (x_dm + txs.dm);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR5_K);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -1908,16 +1908,16 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const int kq0 = ky - ky % (QI5_K/2) + txi % (QI5_K/4) + 0;
|
||||
const int kq1 = ky - ky % (QI5_K/2) + txi % (QI5_K/4) + QI5_K/4;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kq0] = ql0 | qh0;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kq1] = ql1 | qh1;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + kq0] = ql0 | qh0;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + kq1] = ql1 | qh1;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
constexpr int rows_per_warp = warp_size / 2;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) {
|
||||
@@ -1986,7 +1986,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
x_sc[i*(MMQ_TILE_NE_K/8) + i/8 + ksc] = scales8;
|
||||
}
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y>
|
||||
@@ -2029,7 +2029,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
int * x_sc = (int *) (x_df + MMQ_TILE_NE_K/QI6_K);
|
||||
@@ -2038,7 +2038,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
int * x_sc = (int *) (x_df + txs.dm);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR6_K);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -2065,13 +2065,13 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const int kq0 = 2*txi - txi % (QI6_K/2) + 0;
|
||||
const int kq1 = 2*txi - txi % (QI6_K/2) + QI6_K/2;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q6_K + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q6_K + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
@@ -2084,11 +2084,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q6_K] = bxi->d;
|
||||
#else
|
||||
x_df[i*(MMQ_TILE_NE_K/QI6_K) + i/QI6_K] = bxi->d;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
constexpr int rows_per_warp = warp_size / 4;
|
||||
@@ -2102,11 +2102,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + (threadIdx.x % (MMQ_TILE_NE_K/8)) / 4;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_sc[i*MMQ_MMA_TILE_X_K_Q6_K + threadIdx.x%4] = get_int_b2(bxi->scales, threadIdx.x % (MMQ_TILE_NE_K/8));
|
||||
#else
|
||||
x_sc[i*(MMQ_TILE_NE_K/8) + i/8 + threadIdx.x%(MMQ_TILE_NE_K/8)] = get_int_b2(bxi->scales, threadIdx.x%(QI6_K/8));
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2199,7 +2199,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(NEW_MMA_AVAILABLE)
|
||||
#elif defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
typedef tile<16, 4, int> tile_A;
|
||||
typedef tile< 8, 4, int> tile_B;
|
||||
@@ -2311,14 +2311,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_NL, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_NL);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -2340,13 +2340,13 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const int2 v = get_int_from_table_16(aux_q4, kvalues_iq4nl);
|
||||
const int k0 = kbx * (2 * QI4_NL) + kqsx;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + QI4_NL] = v.y;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 0] = v.x;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + QI4_NL] = v.y;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_NL;
|
||||
@@ -2363,11 +2363,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
|
||||
const block_iq4_nl * bxi = (const block_iq4_nl *) x + kbx0 + i*stride + kbxd;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = __half2float(bxi->d);
|
||||
#else
|
||||
x_df[i*(MMQ_TILE_NE_K/QI4_NL) + i/QI4_NL + kbxd] = __half2float(bxi->d);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2376,14 +2376,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ2_XXS, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR2_XXS)) / 2;
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -2414,22 +2414,22 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
|
||||
const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid0;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 1)] = grid1;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 0)] = grid0;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 1)] = grid1;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
const int ls = aux32 >> 28;
|
||||
const float d = bxi->d;
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/4;
|
||||
#else
|
||||
x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = (ls*d + d/2)/4;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2438,14 +2438,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR2_XS)) / 2;
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -2472,24 +2472,24 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
|
||||
const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 0)] = grid_l;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 1)] = grid_h;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 0)] = grid_l;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 1)] = grid_h;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
const int ls = bxi->scales[kqsx];
|
||||
const float d = bxi->d;
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = ((ls & 0x0F)*d + d/2)/4;
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = ((ls >> 4)*d + d/2)/4;
|
||||
#else
|
||||
x_df[i*(2*MMQ_TILE_NE_K*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = ((ls & 0x0F)*d + d/2)/4;
|
||||
x_df[i*(2*MMQ_TILE_NE_K*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = ((ls >> 4)*d + d/2)/4;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2498,14 +2498,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ2_S, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR2_S)) / 2;
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -2539,24 +2539,24 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const int grid_l = __vsub4(grid_pos[0] ^ signs0, signs0);
|
||||
const int grid_h = __vsub4(grid_pos[1] ^ signs1, signs1);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 0)] = grid_l;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 1)] = grid_h;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 0)] = grid_l;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 1)] = grid_h;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
const int ls = bxi->scales[kqsx];
|
||||
const float d = bxi->d;
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = ((ls & 0x0F)*d + d/2)/4;
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = ((ls >> 4)*d + d/2)/4;
|
||||
#else
|
||||
x_df[i*(2*MMQ_TILE_NE_K*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = ((ls & 0x0F)*d + d/2)/4;
|
||||
x_df[i*(2*MMQ_TILE_NE_K*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = ((ls >> 4)*d + d/2)/4;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2565,14 +2565,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_XXS, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR3_XXS)) / 2;
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -2601,22 +2601,22 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
|
||||
const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid_l;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 1)] = grid_h;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 0)] = grid_l;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 1)] = grid_h;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
const int ls = aux32 >> 28;
|
||||
const float d = bxi->d;
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/2;
|
||||
#else
|
||||
x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = (ls*d + d/2)/2;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2625,14 +2625,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_S, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR3_S)) / 2;
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -2668,22 +2668,22 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
|
||||
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l+0)] = grid_l;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l+1)] = grid_h;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l+0)] = grid_l;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l+1)] = grid_h;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
const int ls = 1 + 2*((bxi->scales[kqsx/2] >> (((2*kqsx) << 1) & 0x04)) & 0x0F);
|
||||
const float d = bxi->d;
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = ls*d;
|
||||
#else
|
||||
x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = ls*d;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2692,14 +2692,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
half2 * x_ds = (half2 *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_S, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
half2 * x_ds = (half2 *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR1_S);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -2727,23 +2727,23 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const int grid0 = (grid >> 0) & 0x0F0F0F0F;
|
||||
const int grid1 = (grid >> 4) & 0x0F0F0F0F;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 8*kqsx + (2*l+0)] = grid0;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 8*kqsx + (2*l+1)] = grid1;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l+0)] = grid0;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l+1)] = grid1;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
const float d1q = __half2float(bxi->d) * (((qh >> 11) & 0x0E) + 1);
|
||||
const float delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_ds[i*MMQ_MMA_TILE_X_K_Q8_1 + kqsx] = make_half2(d1q, d1q*delta);
|
||||
#else
|
||||
x_ds[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = make_half2(d1q, d1q*delta);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2752,14 +2752,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_XS);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
@@ -2779,13 +2779,13 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const int2 v = get_int_from_table_16(aux_q4, kvalues_iq4nl);
|
||||
const int k0 = 8 * (kqsx / 4) + kqsx % 4;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 4] = v.y;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 0] = v.x;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 4] = v.y;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
constexpr int rows_per_warp = warp_size / 8;
|
||||
@@ -2804,11 +2804,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
||||
const int ls = ((bxi->scales_l[(threadIdx.x % 8)/2] >> (4*(threadIdx.x % 2))) & 0x0F)
|
||||
| (((bxi->scales_h >> (2*(threadIdx.x % 8))) & 0x03) << 4);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + threadIdx.x % 8] = d * (ls - 32);
|
||||
#else
|
||||
x_df[i*(MMQ_TILE_NE_K/4) + i/4 + threadIdx.x % 8] = d * (ls - 32);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2859,9 +2859,9 @@ static __device__ __forceinline__ void mmq_write_back_mma(
|
||||
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
|
||||
|
||||
const int i0 = (threadIdx.y / ntx) * (ntx*tile_C::I);
|
||||
#if defined(NEW_MMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
|
||||
#if defined(TURING_MMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
|
||||
static_assert(nwarps*tile_C::I == mmq_y, "nwarps*tile_C::I != mmq_y");
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
@@ -3061,13 +3061,13 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
|
||||
int * tile_y = data_mul_mat_q + mmq_x;
|
||||
int * tile_x = tile_y + GGML_PAD(mmq_x*MMQ_TILE_Y_K, nwarps*warp_size);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
constexpr vec_dot_mmq_t vec_dot = mmq_type_traits<mmq_x, mmq_y, need_check, type>::vec_dot_mma;
|
||||
constexpr mmq_write_back_t write_back = mmq_write_back_mma<type, mmq_x, mmq_y, need_check>;
|
||||
#else
|
||||
constexpr vec_dot_mmq_t vec_dot = mmq_type_traits<mmq_x, mmq_y, need_check, type>::vec_dot_dp4a;
|
||||
constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, need_check>;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
|
||||
|
||||
@@ -3534,7 +3534,7 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int
|
||||
const tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(type, mmq_y);
|
||||
const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type);
|
||||
const size_t nbs_ids = mmq_x*sizeof(int);
|
||||
const size_t nbs_x = (new_mma_available(cc) || amd_mfma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
|
||||
const size_t nbs_x = (turing_mma_available(cc) || amd_mfma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
|
||||
const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq);
|
||||
return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int));
|
||||
}
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#include "ggml.h"
|
||||
#include "common.cuh"
|
||||
#include "mmv.cuh"
|
||||
#include "convert.cuh"
|
||||
#include "mmvf.cuh"
|
||||
|
||||
template <typename T, typename type_acc, int ncols_dst, int block_size>
|
||||
static __global__ void mul_mat_vec(
|
||||
static __global__ void mul_mat_vec_f(
|
||||
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
|
||||
const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
|
||||
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
@@ -37,7 +38,7 @@ static __global__ void mul_mat_vec(
|
||||
|
||||
float sumf[ncols_dst] = {0.0f};
|
||||
|
||||
if constexpr (std::is_same<T, float>::value) {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
const float2 * x2 = (const float2 *) x;
|
||||
|
||||
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
|
||||
@@ -50,10 +51,10 @@ static __global__ void mul_mat_vec(
|
||||
sumf[j] += tmpx.y*tmpy.y;
|
||||
}
|
||||
}
|
||||
} else if constexpr (std::is_same<T, half>::value) {
|
||||
} else if constexpr (std::is_same_v<T, half>) {
|
||||
const half2 * x2 = (const half2 *) x;
|
||||
|
||||
if (std::is_same<type_acc, float>::value) {
|
||||
if (std::is_same_v<type_acc, float>) {
|
||||
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
|
||||
const float2 tmpx = __half22float2(x2[col2]);
|
||||
|
||||
@@ -86,19 +87,19 @@ static __global__ void mul_mat_vec(
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FP16_AVAILABLE
|
||||
}
|
||||
} else if constexpr (std::is_same<T, nv_bfloat16>::value) {
|
||||
} else if constexpr (std::is_same_v<T, nv_bfloat16>) {
|
||||
const int * x2 = (const int *) x;
|
||||
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
|
||||
const int tmpx = x2[col2];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; ++j) {
|
||||
const float2 tmpy = y2[j*stride_col_y2 + col2];
|
||||
sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
|
||||
sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
|
||||
sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
|
||||
sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
static_assert(std::is_same<T, void>::value, "unsupported type");
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
@@ -126,7 +127,7 @@ static __global__ void mul_mat_vec(
|
||||
}
|
||||
|
||||
template <typename T, typename type_acc, int ncols_dst>
|
||||
static void launch_mul_mat_vec_cuda(
|
||||
static void launch_mul_mat_vec_f_cuda(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols, const int64_t nrows,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
@@ -141,11 +142,9 @@ static void launch_mul_mat_vec_cuda(
|
||||
GGML_ASSERT( nsamples_dst % nsamples_x == 0);
|
||||
const int64_t channel_ratio = nchannels_dst / nchannels_x;
|
||||
const int64_t sample_ratio = nsamples_dst / nsamples_x;
|
||||
int device;
|
||||
int warp_size;
|
||||
|
||||
CUDA_CHECK(cudaGetDevice(&device));
|
||||
warp_size = ggml_cuda_info().devices[device].warp_size;
|
||||
const int device = ggml_cuda_get_device();
|
||||
const int warp_size = ggml_cuda_info().devices[device].warp_size;
|
||||
|
||||
int64_t block_size_best = warp_size;
|
||||
int64_t niter_best = (ncols + 2*warp_size - 1) / (2*warp_size);
|
||||
@@ -161,54 +160,54 @@ static void launch_mul_mat_vec_cuda(
|
||||
}
|
||||
}
|
||||
|
||||
const int smem = warp_size*sizeof(float);
|
||||
const int nbytes_shared = warp_size*sizeof(float);
|
||||
const dim3 block_nums(nrows, nchannels_dst, nsamples_dst);
|
||||
const dim3 block_dims(block_size_best, 1, 1);
|
||||
switch (block_size_best) {
|
||||
case 32: {
|
||||
mul_mat_vec<T, type_acc, ncols_dst, 32><<<block_nums, block_dims, smem, stream>>>
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, 32><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 64: {
|
||||
mul_mat_vec<T, type_acc, ncols_dst, 64><<<block_nums, block_dims, smem, stream>>>
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, 64><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 96: {
|
||||
mul_mat_vec<T, type_acc, ncols_dst, 96><<<block_nums, block_dims, smem, stream>>>
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, 96><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 128: {
|
||||
mul_mat_vec<T, type_acc, ncols_dst, 128><<<block_nums, block_dims, smem, stream>>>
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 160: {
|
||||
mul_mat_vec<T, type_acc, ncols_dst, 160><<<block_nums, block_dims, smem, stream>>>
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, 160><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 192: {
|
||||
mul_mat_vec<T, type_acc, ncols_dst, 192><<<block_nums, block_dims, smem, stream>>>
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, 192><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 224: {
|
||||
mul_mat_vec<T, type_acc, ncols_dst, 224><<<block_nums, block_dims, smem, stream>>>
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, 224><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 256: {
|
||||
mul_mat_vec<T, type_acc, ncols_dst, 256><<<block_nums, block_dims, smem, stream>>>
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
@@ -220,7 +219,7 @@ static void launch_mul_mat_vec_cuda(
|
||||
}
|
||||
|
||||
template <typename T, typename type_acc>
|
||||
static void mul_mat_vec_cuda_switch_ncols_dst(
|
||||
static void mul_mat_vec_f_cuda_switch_ncols_dst(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols, const int64_t nrows, const int64_t ncols_dst,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
@@ -230,49 +229,49 @@ static void mul_mat_vec_cuda_switch_ncols_dst(
|
||||
cudaStream_t stream) {
|
||||
switch (ncols_dst) {
|
||||
case 1:
|
||||
launch_mul_mat_vec_cuda<T, type_acc, 1>
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 1>
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 2:
|
||||
launch_mul_mat_vec_cuda<T, type_acc, 2>
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 2>
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 3:
|
||||
launch_mul_mat_vec_cuda<T, type_acc, 3>
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 3>
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 4:
|
||||
launch_mul_mat_vec_cuda<T, type_acc, 4>
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 4>
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 5:
|
||||
launch_mul_mat_vec_cuda<T, type_acc, 5>
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 5>
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 6:
|
||||
launch_mul_mat_vec_cuda<T, type_acc, 6>
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 6>
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 7:
|
||||
launch_mul_mat_vec_cuda<T, type_acc, 7>
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 7>
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 8:
|
||||
launch_mul_mat_vec_cuda<T, type_acc, 8>
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 8>
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
@@ -284,7 +283,7 @@ static void mul_mat_vec_cuda_switch_ncols_dst(
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void mul_mat_vec_cuda(
|
||||
static void mul_mat_vec_f_cuda(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols, const int64_t nrows, const int64_t ncols_dst,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int stride_col_dst,
|
||||
@@ -292,22 +291,22 @@ static void mul_mat_vec_cuda(
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
enum ggml_prec prec, cudaStream_t stream) {
|
||||
if constexpr(std::is_same<T, half>::value) {
|
||||
if constexpr(std::is_same_v<T, half>) {
|
||||
if (prec == GGML_PREC_DEFAULT) {
|
||||
mul_mat_vec_cuda_switch_ncols_dst<T, half>
|
||||
mul_mat_vec_f_cuda_switch_ncols_dst<T, half>
|
||||
(x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
return;
|
||||
}
|
||||
}
|
||||
mul_mat_vec_cuda_switch_ncols_dst<T, float>
|
||||
mul_mat_vec_f_cuda_switch_ncols_dst<T, float>
|
||||
(x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
||||
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
||||
GGML_ASSERT( src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
@@ -355,19 +354,19 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03, s13, s3, prec, ctx.stream());
|
||||
} break;
|
||||
case GGML_TYPE_F16: {
|
||||
const half * src0_d = (const half *) src0->data;
|
||||
mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03, s13, s3, prec, ctx.stream());
|
||||
} break;
|
||||
case GGML_TYPE_BF16: {
|
||||
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
|
||||
mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03, s13, s3, prec, ctx.stream());
|
||||
} break;
|
||||
@@ -376,7 +375,7 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_mul_mat_vec(
|
||||
void ggml_cuda_op_mul_mat_vec_f(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
||||
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
@@ -414,19 +413,19 @@ void ggml_cuda_op_mul_mat_vec(
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
const float * src0_d = (const float *) src0_dd_i;
|
||||
mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
|
||||
} break;
|
||||
case GGML_TYPE_F16: {
|
||||
const half * src0_d = (const half *) src0_dd_i;
|
||||
mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
|
||||
} break;
|
||||
case GGML_TYPE_BF16: {
|
||||
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
|
||||
mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
|
||||
} break;
|
||||
@@ -442,15 +441,15 @@ void ggml_cuda_op_mul_mat_vec(
|
||||
GGML_UNUSED(src1_padded_row_size);
|
||||
}
|
||||
|
||||
bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) {
|
||||
bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) {
|
||||
if (src0_ne[0] % 2 != 0) {
|
||||
return false;
|
||||
}
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
|
||||
if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
|
||||
return ne11 <= 8;
|
||||
if (ampere_mma_available(cc)) {
|
||||
return ne11 <= 3;
|
||||
}
|
||||
if (cc >= GGML_CUDA_CC_TURING) {
|
||||
return ne11 <= 4;
|
||||
@@ -466,6 +465,9 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
|
||||
case GGML_TYPE_F16:
|
||||
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
|
||||
const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1);
|
||||
if (ampere_mma_available(cc)) {
|
||||
return src0_small && ne11 == 1;
|
||||
}
|
||||
if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
|
||||
return src0_small && ne11 <= 4;
|
||||
}
|
||||
@@ -486,6 +488,9 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
|
||||
case GGML_TYPE_BF16:
|
||||
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
|
||||
const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1);
|
||||
if (ampere_mma_available(cc)) {
|
||||
return src0_small && ne11 == 1;
|
||||
}
|
||||
if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
|
||||
return src0_small && ne11 <= 4;
|
||||
}
|
||||
@@ -1,11 +1,11 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
|
||||
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_mul_mat_vec(
|
||||
void ggml_cuda_op_mul_mat_vec_f(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
||||
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size, cudaStream_t stream);
|
||||
|
||||
bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11);
|
||||
bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11);
|
||||
@@ -0,0 +1,49 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "opt-step-sgd.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
static __global__ void opt_step_sgd_f32(
|
||||
float * __restrict__ x, const float * __restrict__ g,
|
||||
const float * __restrict__ pars, const int64_t k) {
|
||||
|
||||
const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
x[i] = x[i] * (1.0f - pars[0] * pars[1]) - pars[0] * g[i];
|
||||
}
|
||||
|
||||
static void opt_step_sgd_f32_cuda(
|
||||
float * x, const float * g, const float * __restrict__ pars, const int64_t k, cudaStream_t stream) {
|
||||
|
||||
const dim3 block_dims(CUDA_OPT_STEP_SGD_BLOCK_SIZE, 1, 1);
|
||||
const dim3 block_nums((k + CUDA_OPT_STEP_SGD_BLOCK_SIZE - 1) / CUDA_OPT_STEP_SGD_BLOCK_SIZE, 1, 1);
|
||||
opt_step_sgd_f32<<<block_nums, block_dims, 0, stream>>>(x, g, pars, k);
|
||||
}
|
||||
|
||||
void ggml_cuda_opt_step_sgd(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src0_grad = dst->src[1];
|
||||
const ggml_tensor * params = dst->src[2];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0_grad->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(params->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0_grad));
|
||||
GGML_ASSERT(ggml_is_contiguous(params));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
|
||||
GGML_ASSERT(ggml_nelements(params) == 2);
|
||||
|
||||
float * src0_d = (float *) src0->data;
|
||||
const float * src0_grad_d = (const float *) src0_grad->data;
|
||||
const float * params_d = (const float *) params->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
|
||||
opt_step_sgd_f32_cuda(src0_d, src0_grad_d, params_d, ne, stream);
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_OPT_STEP_SGD_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_opt_step_sgd(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,53 @@
|
||||
#include "common.cuh"
|
||||
|
||||
// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
|
||||
template <bool norm>
|
||||
static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
|
||||
const int row = blockIdx.x;
|
||||
const int col = threadIdx.x;
|
||||
|
||||
float sum = 0.0f;
|
||||
const int num_unroll = 8;
|
||||
float temp[num_unroll];
|
||||
float sum_temp[num_unroll] = { 0.0f };
|
||||
for (int i = col; i < ncols;) {
|
||||
for (int j = 0; j < num_unroll; ++j) {
|
||||
if (i < ncols) {
|
||||
temp[j] = x[row * ncols + i];
|
||||
} else {
|
||||
temp[j] = 0;
|
||||
}
|
||||
i += blockDim.x;
|
||||
}
|
||||
for (int j = 0; j < num_unroll; ++j) {
|
||||
sum_temp[j] += temp[j];
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < num_unroll; ++j) {
|
||||
sum += sum_temp[j];
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
sum = warp_reduce_sum(sum);
|
||||
if (blockDim.x > WARP_SIZE) {
|
||||
assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
|
||||
__shared__ float s_sum[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = sum;
|
||||
}
|
||||
__syncthreads();
|
||||
sum = 0.0f;
|
||||
if (lane_id < (blockDim.x / WARP_SIZE)) {
|
||||
sum = s_sum[lane_id];
|
||||
}
|
||||
sum = warp_reduce_sum(sum);
|
||||
}
|
||||
|
||||
if (col != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[row] = norm ? sum / ncols : sum;
|
||||
}
|
||||
@@ -3,11 +3,6 @@
|
||||
|
||||
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
__device__ __forceinline__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
|
||||
convert_flt(src_f, dst_f);
|
||||
}
|
||||
|
||||
// Generic quantized set_rows kernel template
|
||||
template<typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
|
||||
static __global__ void k_set_rows_quant(
|
||||
@@ -117,9 +112,7 @@ static __global__ void k_set_rows(
|
||||
const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
|
||||
dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3;
|
||||
|
||||
const src_t* src_elem = src0_row + i00;
|
||||
dst_t* dst_elem = dst_row_ptr + i00;
|
||||
set_rows_1(src_elem, dst_elem);
|
||||
dst_row_ptr[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
|
||||
|
||||
GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne13);
|
||||
|
||||
+153
-71
@@ -1,87 +1,117 @@
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
#define USE_CUB
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
|
||||
#ifdef USE_CUB
|
||||
#include <cub/cub.cuh>
|
||||
using namespace cub;
|
||||
#endif // USE_CUB
|
||||
|
||||
#include "ssm-scan.cuh"
|
||||
|
||||
template <size_t splitD, size_t N>
|
||||
__global__ void __launch_bounds__(splitD, 2)
|
||||
ssm_scan_f32(const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,
|
||||
const float * __restrict__ src3, const float * __restrict__ src4, const float * __restrict__ src5,
|
||||
// We would like to keep pragma unroll for cases where L_template is not 0,
|
||||
// so we suppress the clang transformation warning.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||
#endif // __clang__
|
||||
template <size_t splitD, size_t N, size_t L_template>
|
||||
__global__ void __launch_bounds__(splitD, 1)
|
||||
ssm_scan_f32(const float *__restrict__ src0, const float *__restrict__ src1, const float *__restrict__ src2,
|
||||
const float *__restrict__ src3, const float *__restrict__ src4, const float *__restrict__ src5,
|
||||
const int32_t * __restrict__ src6, float * __restrict__ dst,
|
||||
const int src0_nb2, const int src0_nb3, const int src1_nb2, const int src1_nb3,
|
||||
const int src2_nb1, const int src2_nb2, const int src3_nb1,
|
||||
const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3,
|
||||
const int64_t s_off, const int64_t d_inner, const int64_t L) {
|
||||
const int64_t s_off, const int64_t d_inner, const int64_t L_param)
|
||||
{
|
||||
const size_t L = L_template == 0 ? L_param : L_template;
|
||||
const float *s0_block = (const float *)((const char *)src0 + src6[blockIdx.x] * src0_nb3 + blockIdx.y * splitD * src0_nb2);
|
||||
const float *x_block = (const float *)((const char *)src1 + (blockIdx.x * src1_nb3) + blockIdx.y * splitD * sizeof(float));
|
||||
const float *dt_block = (const float *)((const char *)src2 + (blockIdx.x * src2_nb2) + blockIdx.y * splitD * sizeof(float));
|
||||
const float *A_block = (const float *)((const char *)src3 + blockIdx.y * splitD * src3_nb1);
|
||||
const float *B_block = (const float *)((const char *)src4 + (blockIdx.x * src4_nb3));
|
||||
const float *C_block = (const float *)((const char *)src5 + (blockIdx.x * src5_nb3));
|
||||
float *y_block = (float *)((char *)dst + (blockIdx.x * d_inner * L * sizeof(float)) + blockIdx.y * splitD * sizeof(float));
|
||||
float *s_block = (float *)((char *)dst + s_off + blockIdx.x * src0_nb3 + blockIdx.y * splitD * src0_nb2);
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
const int bidx = blockIdx.x; // split along B (sequences)
|
||||
const int bidy = blockIdx.y; // split along D (d_inner)
|
||||
const int tid = threadIdx.x;
|
||||
const int wid = tid / 32;
|
||||
const int wtid = tid % 32;
|
||||
|
||||
extern __shared__ float smem[];
|
||||
const int stride_sA = N + 1;
|
||||
const int stride_ss0 = N + 1;
|
||||
float * smem_A = smem;
|
||||
float * smem_s0 = smem_A + splitD * stride_sA;
|
||||
|
||||
const float * s0_block = (const float *) ((const char *) src0 + src6[bidx] * src0_nb3 + bidy * splitD * src0_nb2);
|
||||
const float * x_block = (const float *) ((const char *) src1 + (bidx * src1_nb3) + bidy * splitD * sizeof(float));
|
||||
const float * dt_block = (const float *) ((const char *) src2 + (bidx * src2_nb2) + bidy * splitD * sizeof(float));
|
||||
const float * A_block = (const float *) ((const char *) src3 + bidy * splitD * src3_nb1);
|
||||
const float * B_block = (const float *) ((const char *) src4 + (bidx * src4_nb3));
|
||||
const float * C_block = (const float *) ((const char *) src5 + (bidx * src5_nb3));
|
||||
float * y_block = (float *) ((char *) dst + (bidx * d_inner * L * sizeof(float)) + bidy * splitD * sizeof(float));
|
||||
float * s_block = (float *) ((char *) dst + s_off + bidx * src0_nb3 + bidy * splitD * src0_nb2);
|
||||
|
||||
const int stride_s0 = src0_nb2 / sizeof(float);
|
||||
const int stride_x = src1_nb2 / sizeof(float);
|
||||
const int stride_x = src1_nb2 / sizeof(float);
|
||||
const int stride_dt = src2_nb1 / sizeof(float);
|
||||
const int stride_A = src3_nb1 / sizeof(float);
|
||||
const int stride_B = src4_nb2 / sizeof(float);
|
||||
const int stride_C = src5_nb2 / sizeof(float);
|
||||
const int stride_s = stride_s0;
|
||||
const int stride_y = d_inner;
|
||||
const int stride_B = src4_nb2 / sizeof(float);
|
||||
const int stride_C = src5_nb2 / sizeof(float);
|
||||
const int stride_y = d_inner;
|
||||
|
||||
// can N not be 16? for example 32?
|
||||
if (N == 16) {
|
||||
float regA[N];
|
||||
float regs0[N];
|
||||
|
||||
__shared__ float smemB[N];
|
||||
__shared__ float smemC[N];
|
||||
|
||||
#ifdef USE_CUB
|
||||
using BlockLoad = cub::BlockLoad<float, splitD, N, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
|
||||
using BlockStore = cub::BlockStore<float, splitD, N, cub::BLOCK_STORE_WARP_TRANSPOSE>;
|
||||
|
||||
union CubTempStorage {
|
||||
typename BlockLoad::TempStorage load_temp;
|
||||
typename BlockStore::TempStorage store_temp;
|
||||
};
|
||||
__shared__ CubTempStorage cub_temp_storage;
|
||||
|
||||
BlockLoad(cub_temp_storage.load_temp).Load(A_block, regA);
|
||||
BlockLoad(cub_temp_storage.load_temp).Load(s0_block, regs0);
|
||||
#else
|
||||
const int stride_s0 = src0_nb2 / sizeof(float);
|
||||
const int stride_A = src3_nb1 / sizeof(float);
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < splitD / 4; i += 2) {
|
||||
float value = A_block[(wid * warp_size + i) * stride_A + wtid];
|
||||
// todo: bank conflict
|
||||
// I am always confused with how to use the swizzling method to solve
|
||||
// bank conflit. Hoping somebody can tell me.
|
||||
smem_A[(wid * warp_size + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
}
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < splitD / 4; i += 2) {
|
||||
float value = s0_block[(wid * warp_size + i) * stride_s0 + wtid];
|
||||
smem_s0[(wid * warp_size + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
}
|
||||
for (size_t n = 0; n < N; ++n)
|
||||
{
|
||||
regA[n] = A_block[threadIdx.x * stride_A + n];
|
||||
regs0[n] = s0_block[threadIdx.x * stride_s0 + n];
|
||||
}
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int64_t i = 0; i < L; i++) {
|
||||
float dt_soft_plus = dt_block[i * stride_dt + tid];
|
||||
if (dt_soft_plus <= 20.0f) {
|
||||
dt_soft_plus = log1pf(exp(dt_soft_plus));
|
||||
}
|
||||
float x_dt = x_block[i * stride_x + tid] * dt_soft_plus;
|
||||
float sumf = 0.0f;
|
||||
#pragma unroll
|
||||
for (size_t j = 0; j < N; j++) {
|
||||
float state = (smem_s0[tid * stride_ss0 + j] * expf(dt_soft_plus * smem_A[tid * stride_sA + j])) +
|
||||
(B_block[i * stride_B + j] * x_dt);
|
||||
sumf += state * C_block[i * stride_C + j];
|
||||
if (i == L - 1) {
|
||||
s_block[tid * stride_s + j] = state;
|
||||
} else {
|
||||
smem_s0[tid * stride_ss0 + j] = state;
|
||||
}
|
||||
for (size_t i = 0; i < L; i++)
|
||||
{
|
||||
if (threadIdx.x < N)
|
||||
{
|
||||
smemB[threadIdx.x] = B_block[i * stride_B + threadIdx.x];
|
||||
smemC[threadIdx.x] = C_block[i * stride_C + threadIdx.x];
|
||||
}
|
||||
__syncthreads();
|
||||
y_block[i * stride_y + tid] = sumf;
|
||||
|
||||
float dt_soft_plus = dt_block[i * stride_dt + threadIdx.x];
|
||||
if (dt_soft_plus <= 20.0f)
|
||||
{
|
||||
dt_soft_plus = log1pf(expf(dt_soft_plus));
|
||||
}
|
||||
float x_dt = x_block[i * stride_x + threadIdx.x] * dt_soft_plus;
|
||||
|
||||
float sumf = 0.0f;
|
||||
#pragma unroll
|
||||
for (size_t n = 0; n < N; n++)
|
||||
{
|
||||
float state = regs0[n] * expf(dt_soft_plus * regA[n]) + smemB[n] * x_dt;
|
||||
sumf += state * smemC[n];
|
||||
regs0[n] = state;
|
||||
}
|
||||
y_block[i * stride_y + threadIdx.x] = sumf;
|
||||
}
|
||||
|
||||
#ifdef USE_CUB
|
||||
BlockStore(cub_temp_storage.store_temp).Store(s_block, regs0);
|
||||
#else
|
||||
const int stride_s = stride_s0;
|
||||
#pragma unroll
|
||||
for (size_t n = 0; n < N; ++n)
|
||||
{
|
||||
s_block[threadIdx.x * stride_s + n] = regs0[n];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic pop
|
||||
#endif // __clang__
|
||||
|
||||
// assumes as many threads as d_state
|
||||
template <int splitH, int d_state>
|
||||
@@ -201,11 +231,11 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
|
||||
const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim,
|
||||
const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq,
|
||||
cudaStream_t stream) {
|
||||
const int threads = 128;
|
||||
// NOTE: if you change conditions here, be sure to update the corresponding supports_op condition!
|
||||
if (src3_nb1 == sizeof(float)) {
|
||||
// Mamba-2
|
||||
if (d_state == 128) {
|
||||
const int threads = 128;
|
||||
GGML_ASSERT(d_state % threads == 0);
|
||||
// NOTE: can be any power of two between 4 and 64
|
||||
const int splitH = 16;
|
||||
@@ -229,7 +259,6 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
|
||||
GGML_ABORT("doesn't support d_state!=(128 or 256).");
|
||||
}
|
||||
} else {
|
||||
const int threads = 128;
|
||||
// Mamba-1
|
||||
GGML_ASSERT(n_head % threads == 0);
|
||||
GGML_ASSERT(head_dim == 1);
|
||||
@@ -237,10 +266,63 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
|
||||
const dim3 blocks(n_seq, (n_head + threads - 1) / threads, 1);
|
||||
const int smem_size = (threads * (d_state + 1) * 2) * sizeof(float);
|
||||
if (d_state == 16) {
|
||||
ssm_scan_f32<128, 16><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
switch (n_tok)
|
||||
{
|
||||
case 1:
|
||||
ssm_scan_f32<threads, 16, 1><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 2:
|
||||
ssm_scan_f32<threads, 16, 2><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 3:
|
||||
ssm_scan_f32<threads, 16, 3><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 4:
|
||||
ssm_scan_f32<threads, 16, 4><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 5:
|
||||
ssm_scan_f32<threads, 16, 5><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 6:
|
||||
ssm_scan_f32<threads, 16, 6><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 7:
|
||||
ssm_scan_f32<threads, 16, 7><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
case 8:
|
||||
ssm_scan_f32<threads, 16, 8><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
default:
|
||||
ssm_scan_f32<threads, 16, 0><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
|
||||
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("doesn't support d_state!=16.");
|
||||
}
|
||||
|
||||
@@ -1,19 +1,15 @@
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
#define USE_CUB
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
#include "sum.cuh"
|
||||
#include "sumrows.cuh"
|
||||
|
||||
#ifdef USE_CUB
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
#include <cub/cub.cuh>
|
||||
using namespace cub;
|
||||
#endif // USE_CUB
|
||||
|
||||
#include "sumrows.cuh"
|
||||
#include "sum.cuh"
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
|
||||
#ifdef USE_CUB
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
size_t tmp_size = 0;
|
||||
DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
|
||||
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
|
||||
@@ -23,7 +19,7 @@ void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int
|
||||
// For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
|
||||
sum_rows_f32_cuda(x, dst, ne, 1, stream);
|
||||
GGML_UNUSED(pool);
|
||||
#endif // USE_CUB
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
@@ -1,9 +1,17 @@
|
||||
#include "reduce_rows.cuh"
|
||||
#include "sumrows.cuh"
|
||||
|
||||
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
if ((nrows / nsm) < 2) {
|
||||
const dim3 block_dims(512, 1, 1);
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
} else {
|
||||
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
@@ -19,8 +27,17 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const int64_t ncols = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||
if ((nrows / nsm) < 2) {
|
||||
// Increase num threads to 512 for small nrows to better hide the latency
|
||||
const dim3 block_dims(512, 1, 1);
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
} else {
|
||||
// Enough active SMs to hide latency, use smaller blocks to allow better scheduling
|
||||
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
}
|
||||
}
|
||||
|
||||
Vendored
+7
-23
@@ -1,12 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
|
||||
#define HIP_DISABLE_WARP_SYNC_BUILTINS 1
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hipblas/hipblas.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <hip/hip_bfloat16.h>
|
||||
// for rocblas_initialize()
|
||||
#include "rocblas/rocblas.h"
|
||||
#include <hip/hip_bf16.h>
|
||||
|
||||
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
||||
@@ -137,7 +135,7 @@
|
||||
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
||||
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
||||
|
||||
#if HIP_VERSION >= 70000000
|
||||
#if HIP_VERSION >= 60500000
|
||||
#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
|
||||
#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
|
||||
@@ -149,7 +147,7 @@
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
||||
#define cublasComputeType_t hipblasDatatype_t
|
||||
#define cudaDataType_t hipblasDatatype_t
|
||||
#endif // HIP_VERSION >= 7000000
|
||||
#endif // HIP_VERSION >= 6050000
|
||||
|
||||
#if !defined(__HIP_PLATFORM_AMD__)
|
||||
#error "The HIP backend supports only AMD targets"
|
||||
@@ -181,8 +179,7 @@
|
||||
#define RDNA4
|
||||
#endif
|
||||
|
||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
|
||||
defined(__gfx1150__) || defined(__gfx1151__)
|
||||
#if defined(__GFX11__)
|
||||
#define RDNA3
|
||||
#endif
|
||||
|
||||
@@ -199,7 +196,8 @@
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
typedef hip_bfloat16 nv_bfloat16;
|
||||
typedef __hip_bfloat16 nv_bfloat16;
|
||||
typedef __hip_bfloat162 nv_bfloat162;
|
||||
|
||||
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||
@@ -250,17 +248,3 @@ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigne
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
#if HIP_VERSION < 50600000
|
||||
// __shfl_xor() for half2 was added in ROCm 5.6
|
||||
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
|
||||
typedef union half2_b32 {
|
||||
half2 val;
|
||||
int b32;
|
||||
} half2_b32_t;
|
||||
half2_b32_t tmp;
|
||||
tmp.val = var;
|
||||
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
|
||||
return tmp.val;
|
||||
}
|
||||
#endif // HIP_VERSION < 50600000
|
||||
|
||||
Vendored
+2
-1
@@ -137,4 +137,5 @@
|
||||
#define cudaStreamEndCapture musaStreamEndCapture
|
||||
#define cudaOccupancyMaxActiveBlocksPerMultiprocessor musaOccupancyMaxActiveBlocksPerMultiprocessor
|
||||
|
||||
typedef mt_bfloat16 nv_bfloat16;
|
||||
typedef __mt_bfloat16 nv_bfloat16;
|
||||
typedef __mt_bfloat162 nv_bfloat162;
|
||||
|
||||
@@ -46,8 +46,8 @@ if (GGML_HIP_ROCWMMA_FATTN)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (${hip_VERSION} VERSION_LESS 5.5)
|
||||
message(FATAL_ERROR "At least ROCM/HIP V5.5 is required")
|
||||
if (${hip_VERSION} VERSION_LESS 6.1)
|
||||
message(FATAL_ERROR "At least ROCM/HIP V6.1 is required")
|
||||
endif()
|
||||
|
||||
message(STATUS "HIP and hipBLAS found")
|
||||
@@ -121,6 +121,10 @@ if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7
|
||||
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
|
||||
endif()
|
||||
|
||||
if (GGML_HIP_EXPORT_METRICS)
|
||||
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Rpass-analysis=kernel-resource-usage --save-temps")
|
||||
endif()
|
||||
|
||||
if (NOT GGML_CUDA_FA)
|
||||
add_compile_definitions(GGML_CUDA_NO_FA)
|
||||
endif()
|
||||
|
||||
@@ -55,6 +55,7 @@ endfunction()
|
||||
|
||||
set(GGML_OPENCL_KERNELS
|
||||
add
|
||||
add_id
|
||||
argsort
|
||||
clamp
|
||||
cpy
|
||||
|
||||
@@ -345,6 +345,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_command_queue queue;
|
||||
|
||||
cl_program program_add;
|
||||
cl_program program_add_id;
|
||||
cl_program program_clamp;
|
||||
cl_program program_cpy;
|
||||
cl_program program_cvt;
|
||||
@@ -404,6 +405,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
|
||||
cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
|
||||
cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
|
||||
cl_kernel kernel_add_id;
|
||||
cl_kernel kernel_scale;
|
||||
cl_kernel kernel_silu, kernel_silu_4;
|
||||
cl_kernel kernel_gelu, kernel_gelu_4;
|
||||
@@ -412,7 +414,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_relu;
|
||||
cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
|
||||
cl_kernel kernel_clamp;
|
||||
cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_geglu_erf, kernel_geglu_quick,
|
||||
cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_swiglu_oai, kernel_geglu_erf, kernel_geglu_quick,
|
||||
kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
|
||||
cl_kernel kernel_norm;
|
||||
cl_kernel kernel_rms_norm, kernel_rms_norm_mul;
|
||||
@@ -600,6 +602,7 @@ struct ggml_backend_opencl_context {
|
||||
if (ref_count == 0) {
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
write_profiling_info();
|
||||
profiling_info.clear();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@@ -681,6 +684,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// add_id
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "add_id.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("add_id.cl");
|
||||
#endif
|
||||
backend_ctx->program_add_id =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_add_id = clCreateKernel(backend_ctx->program_add_id, "kernel_add_id", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// clamp
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -787,6 +806,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_swiglu_oai = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_oai", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_erf = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_quick = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
|
||||
@@ -2461,12 +2481,21 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_OP_SCALE:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_ADD:
|
||||
if (op->type == GGML_TYPE_F16) {
|
||||
const bool src0_ok = op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32;
|
||||
const bool src1_ok = op->src[1]->type == GGML_TYPE_F16 || op->src[1]->type == GGML_TYPE_F32;
|
||||
if (src0_ok && src1_ok) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_SUB:
|
||||
return (op->src[0]->type == op->src[1]->type) &&
|
||||
(op->src[0]->type == op->type) &&
|
||||
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
|
||||
case GGML_OP_ADD_ID:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_GELU:
|
||||
@@ -2488,6 +2517,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_SWIGLU_OAI:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
|
||||
@@ -2497,8 +2527,6 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_OP_CLAMP:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
// TODO: support attention sinks [TAG_ATTN_SINKS]
|
||||
return op->src[2] == nullptr;
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
return true;
|
||||
@@ -2603,10 +2631,10 @@ ggml_backend_t ggml_backend_opencl_init(void) {
|
||||
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
|
||||
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_opencl_guid(),
|
||||
/* .interface = */ ggml_backend_opencl_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ backend_ctx
|
||||
/* .guid = */ ggml_backend_opencl_guid(),
|
||||
/* .iface = */ ggml_backend_opencl_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ backend_ctx
|
||||
};
|
||||
|
||||
return backend;
|
||||
@@ -3696,34 +3724,30 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb00 = src0->nb[0];
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3]; UNUSED(ne13);
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3];
|
||||
|
||||
const cl_ulong nb10 = src1->nb[0];
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
|
||||
const cl_ulong nb13 = src1->nb[3];
|
||||
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
const int ne2 = dst->ne[2];
|
||||
const int ne3 = dst->ne[3];
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
const int ne2 = dst->ne[2];
|
||||
const int ne3 = dst->ne[3];
|
||||
|
||||
const cl_ulong nb0 = dst->nb[0];
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
@@ -3740,68 +3764,114 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
bool bcast_row = false;
|
||||
cl_kernel kernel;
|
||||
|
||||
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
|
||||
const bool bcast_row = ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0;
|
||||
|
||||
if (bcast_row) {
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
// src1 is a row
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
}
|
||||
|
||||
bcast_row = true;
|
||||
int ne = ne00 / 4;
|
||||
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32);
|
||||
if (bcast_row) {
|
||||
kernel = backend_ctx->kernel_add_row;
|
||||
const int ne = ne00 / 4;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_add_row_f16;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||
} else {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_add;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
|
||||
}
|
||||
} else if (dst->type == GGML_TYPE_F16) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
|
||||
const int type_src0 = (src0->type == GGML_TYPE_F32);
|
||||
const int type_src1 = (src1->type == GGML_TYPE_F32);
|
||||
if (bcast_row) {
|
||||
kernel = backend_ctx->kernel_add_row_f16;
|
||||
const int ne = ne00 / 4;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &type_src0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &type_src1));
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_add_f16;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 30, sizeof(int), &type_src0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 31, sizeof(int), &type_src1));
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
|
||||
} else {
|
||||
GGML_ASSERT(false && "unsupported data types for add");
|
||||
}
|
||||
|
||||
if (bcast_row) {
|
||||
@@ -3811,19 +3881,88 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
||||
local_work_size_ptr = nullptr;
|
||||
}
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size_ptr, dst);
|
||||
} else {
|
||||
unsigned int nth = MIN(64, ne0);
|
||||
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {nth, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_add_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const ggml_tensor * src2 = dst->src[2];
|
||||
GGML_ASSERT(src2);
|
||||
GGML_ASSERT(src2->extra);
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src2->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(src0));
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
|
||||
const cl_ulong nb21 = src2->nb[1];
|
||||
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offset2 = extra2->offset + src2->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_add_id;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb21));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1));
|
||||
|
||||
int nth = MIN(ne00, (int) backend_ctx->get_kernel_workgroup_size(kernel));
|
||||
size_t global_work_size[] = { (size_t)ne01*nth, (size_t)ne02, 1 };
|
||||
size_t local_work_size[] = { (size_t)nth, 1, 1 };
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -6502,17 +6641,24 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
GGML_ASSERT(src1->extra);
|
||||
}
|
||||
|
||||
const ggml_tensor * src2 = dst->src[2];
|
||||
if (src2) {
|
||||
GGML_ASSERT(src2->extra);
|
||||
}
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
|
||||
ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
|
||||
cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0;
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
@@ -6580,25 +6726,27 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), extra1 ? &extra1->data_device : &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &scale));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(float), &max_bias));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &m0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &m1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &n_head_log2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), extra2 ? &extra2->data_device : &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &scale));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &max_bias));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(float), &m0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &m1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &n_head_log2));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
@@ -7005,6 +7153,9 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
kernel = backend_ctx->kernel_swiglu_f16;
|
||||
}
|
||||
break;
|
||||
case GGML_GLU_OP_SWIGLU_OAI:
|
||||
kernel = backend_ctx->kernel_swiglu_oai;
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_geglu_erf;
|
||||
@@ -7040,7 +7191,10 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
|
||||
const int swp = ((const int32_t *) dst->op_params)[1];
|
||||
const int swp = ggml_get_op_params_i32(dst, 1);
|
||||
const float alpha = ggml_get_op_params_f32(dst, 2);
|
||||
const float limit = ggml_get_op_params_f32(dst, 3);
|
||||
|
||||
const int ne00_off = src1 ? 0 : (swp ? ne0 : 0);
|
||||
const int ne10_off = src1 ? 0 : (swp ? 0 : ne0);
|
||||
|
||||
@@ -7057,6 +7211,11 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne00_off));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10_off));
|
||||
|
||||
if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU_OAI) {
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &limit));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &alpha));
|
||||
}
|
||||
|
||||
const size_t nrows = ggml_nrows(src0);
|
||||
size_t nth = 512;
|
||||
size_t global_work_size[] = {nrows*nth, 1, 1};
|
||||
@@ -7113,6 +7272,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_add;
|
||||
break;
|
||||
case GGML_OP_ADD_ID:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_add_id;
|
||||
break;
|
||||
case GGML_OP_MUL:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
||||
@@ -112,7 +112,9 @@ kernel void kernel_add_f16(
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
ulong nb3,
|
||||
int type_src0,
|
||||
int type_src1
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
@@ -132,25 +134,57 @@ kernel void kernel_add_f16(
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const int i10 = i0 % ne10;
|
||||
*((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) + *((global half *)(src1_ptr + i10*nb10));
|
||||
|
||||
half v0, v1;
|
||||
if (type_src0 == 1) {
|
||||
v0 = convert_half(*((global float *)(src0_ptr + i0*nb00)));
|
||||
} else {
|
||||
v0 = *((global half *)(src0_ptr + i0*nb00));
|
||||
}
|
||||
|
||||
if (type_src1 == 1) {
|
||||
v1 = convert_half(*((global float *)(src1_ptr + i10*nb10)));
|
||||
} else {
|
||||
v1 = *((global half *)(src1_ptr + i10*nb10));
|
||||
}
|
||||
|
||||
*((global half *)(dst_ptr + i0*nb0)) = v0 + v1;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_add_row_f16(
|
||||
global half4 * src0,
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global half4 * src1,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global half4 * dst,
|
||||
ulong offsetd,
|
||||
int ne
|
||||
int ne,
|
||||
int type_src0,
|
||||
int type_src1
|
||||
) {
|
||||
src0 = (global half4*)((global char*)src0 + offset0);
|
||||
src1 = (global half4*)((global char*)src1 + offset1);
|
||||
dst = (global half4*)((global char*)dst + offsetd);
|
||||
|
||||
// This performs better than using %.
|
||||
uint gid = get_global_id(0);
|
||||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||
dst[gid] = src0[gid] + src1[idx1];
|
||||
|
||||
half4 v0, v1;
|
||||
if (type_src0 == 1) {
|
||||
global float4* src0_f32 = (global float4*)((global char*)src0 + offset0);
|
||||
v0 = convert_half4(src0_f32[gid]);
|
||||
} else {
|
||||
global half4* src0_f16 = (global half4*)((global char*)src0 + offset0);
|
||||
v0 = src0_f16[gid];
|
||||
}
|
||||
|
||||
if (type_src1 == 1) {
|
||||
global float4* src1_f32 = (global float4*)((global char*)src1 + offset1);
|
||||
v1 = convert_half4(src1_f32[idx1]);
|
||||
} else {
|
||||
global half4* src1_f16 = (global half4*)((global char*)src1 + offset1);
|
||||
v1 = src1_f16[idx1];
|
||||
}
|
||||
|
||||
dst[gid] = v0 + v1;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,42 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// add_id
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_add_id(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * src2,
|
||||
ulong offset2,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb11,
|
||||
ulong nb21,
|
||||
int ne0,
|
||||
int ne1
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
src2 = (global char*)((global char*)src2 + offset2);
|
||||
dst = (global char*)((global char*)dst + offsetd);
|
||||
|
||||
int i1 = get_group_id(0);
|
||||
int i2 = get_group_id(1);
|
||||
|
||||
const int i11 = *((global const int *) (src2 + i1*sizeof(int) + i2*nb21));
|
||||
|
||||
const size_t nb1 = ne0 * sizeof(float);
|
||||
const size_t nb2 = ne1 * nb1;
|
||||
|
||||
global float * dst_row = (global float *)((global char *)dst + i1*nb1 + i2*nb2);
|
||||
global float * src0_row = (global float *)((global char *)src0 + i1*nb01 + i2*nb02);
|
||||
global float * src1_row = (global float *)((global char *)src1 + i11*nb11);
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
dst_row[i0] = src0_row[i0] + src1_row[i0];
|
||||
}
|
||||
}
|
||||
@@ -202,6 +202,47 @@ kernel void kernel_swiglu_f16(
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// swiglu_oai
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_swiglu_oai(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb01,
|
||||
ulong nb11,
|
||||
int ne0,
|
||||
ulong nb1,
|
||||
int ne00_off,
|
||||
int ne10_off,
|
||||
float limit,
|
||||
float alpha
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global char*)((global char*)dst + offsetd);
|
||||
|
||||
global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
|
||||
global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
|
||||
global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1);
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
float x0 = src0_row[i0];
|
||||
float x1 = src1_row[i0];
|
||||
|
||||
x0 = min(x0, limit);
|
||||
x1 = max(min(x1, limit), -limit);
|
||||
|
||||
float out_glu = x0 / (1.0f + exp(-x0 * alpha));
|
||||
out_glu = out_glu * (1.0f + x1);
|
||||
|
||||
dst_row[i0] = out_glu;
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// geglu_erf
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
@@ -26,6 +26,8 @@ kernel void kernel_soft_max_4_f16(
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * src2,
|
||||
ulong offset2,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
@@ -48,6 +50,7 @@ kernel void kernel_soft_max_4_f16(
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
src2 = src2 + offset2;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
@@ -60,6 +63,7 @@ kernel void kernel_soft_max_4_f16(
|
||||
|
||||
global float4 * psrc4 = (global float4 *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
global half4 * pmask = src1 != src0 ? (global half4 *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
|
||||
global float * psrc2 = src2 != src0 ? (global float *)(src2) : 0;
|
||||
global float4 * pdst4 = (global float4 *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
@@ -75,7 +79,7 @@ kernel void kernel_soft_max_4_f16(
|
||||
}
|
||||
|
||||
// parallel max
|
||||
float4 lmax4 = -INFINITY;
|
||||
float4 lmax4 = psrc2 ? psrc2[i02] : -INFINITY;
|
||||
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
|
||||
lmax4 = fmax(lmax4, psrc4[i00]*scale + slope*(pmask ? convert_float4(pmask[i00]) : 0.0f));
|
||||
}
|
||||
@@ -92,7 +96,11 @@ kernel void kernel_soft_max_4_f16(
|
||||
}
|
||||
float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;
|
||||
|
||||
const float sum = sub_group_reduce_add(lsum);
|
||||
float sum = sub_group_reduce_add(lsum);
|
||||
|
||||
if (psrc2) {
|
||||
sum += exp(psrc2[i02] - max);
|
||||
}
|
||||
|
||||
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
|
||||
pdst4[i00] /= sum;
|
||||
|
||||
@@ -26,6 +26,8 @@ kernel void kernel_soft_max_4(
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * src2,
|
||||
ulong offset2,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
@@ -48,6 +50,7 @@ kernel void kernel_soft_max_4(
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
src2 = src2 + offset2;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
@@ -60,6 +63,7 @@ kernel void kernel_soft_max_4(
|
||||
|
||||
global float4 * psrc4 = (global float4 *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
global float4 * pmask = src1 != src0 ? (global float4 *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
|
||||
global float * psrc2 = src2 != src0 ? (global float *)(src2) : 0;
|
||||
global float4 * pdst4 = (global float4 *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
@@ -75,7 +79,7 @@ kernel void kernel_soft_max_4(
|
||||
}
|
||||
|
||||
// parallel max
|
||||
float4 lmax4 = -INFINITY;
|
||||
float4 lmax4 = psrc2 ? psrc2[i02] : -INFINITY;
|
||||
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
|
||||
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
|
||||
}
|
||||
@@ -92,7 +96,11 @@ kernel void kernel_soft_max_4(
|
||||
}
|
||||
float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;
|
||||
|
||||
const float sum = sub_group_reduce_add(lsum);
|
||||
float sum = sub_group_reduce_add(lsum);
|
||||
|
||||
if (psrc2) {
|
||||
sum += exp(psrc2[i02] - max);
|
||||
}
|
||||
|
||||
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
|
||||
pdst4[i00] /= sum;
|
||||
|
||||
@@ -26,6 +26,8 @@ kernel void kernel_soft_max_f16(
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * src2,
|
||||
ulong offset2,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
@@ -48,6 +50,7 @@ kernel void kernel_soft_max_f16(
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
src2 = src2 + offset2;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
@@ -60,6 +63,7 @@ kernel void kernel_soft_max_f16(
|
||||
|
||||
global float * psrc0 = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
global half * pmask = src1 != src0 ? (global half *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
|
||||
global float * psrc2 = src2 != src0 ? (global float *)(src2) : 0;
|
||||
global float * pdst = (global float *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
@@ -75,7 +79,7 @@ kernel void kernel_soft_max_f16(
|
||||
}
|
||||
|
||||
// parallel max
|
||||
float lmax = -INFINITY;
|
||||
float lmax = psrc2 ? psrc2[i02] : -INFINITY;
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
|
||||
}
|
||||
@@ -91,7 +95,11 @@ kernel void kernel_soft_max_f16(
|
||||
pdst[i00] = exp_psrc0;
|
||||
}
|
||||
|
||||
const float sum = sub_group_reduce_add(lsum);
|
||||
float sum = sub_group_reduce_add(lsum);
|
||||
|
||||
if (psrc2) {
|
||||
sum += exp(psrc2[i02] - max);
|
||||
}
|
||||
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
pdst[i00] /= sum;
|
||||
|
||||
@@ -26,6 +26,8 @@ kernel void kernel_soft_max(
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * src2,
|
||||
ulong offset2,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
@@ -48,6 +50,7 @@ kernel void kernel_soft_max(
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
src2 = src2 + offset2;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
@@ -60,6 +63,7 @@ kernel void kernel_soft_max(
|
||||
|
||||
global float * psrc0 = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
global float * pmask = src1 != src0 ? (global float *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
|
||||
global float * psrc2 = src2 != src0 ? (global float *)(src2) : 0;
|
||||
global float * pdst = (global float *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
@@ -75,7 +79,7 @@ kernel void kernel_soft_max(
|
||||
}
|
||||
|
||||
// parallel max
|
||||
float lmax = -INFINITY;
|
||||
float lmax = psrc2 ? psrc2[i02] : -INFINITY;
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
|
||||
}
|
||||
@@ -91,7 +95,11 @@ kernel void kernel_soft_max(
|
||||
pdst[i00] = exp_psrc0;
|
||||
}
|
||||
|
||||
const float sum = sub_group_reduce_add(lsum);
|
||||
float sum = sub_group_reduce_add(lsum);
|
||||
|
||||
if (psrc2) {
|
||||
sum += exp(psrc2[i02] - max);
|
||||
}
|
||||
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
pdst[i00] /= sum;
|
||||
|
||||
+94
-38
@@ -64,9 +64,11 @@ struct ggml_opt_context {
|
||||
int32_t opt_i = 0;
|
||||
bool loss_per_datapoint = false;
|
||||
|
||||
ggml_opt_get_optimizer_params get_opt_pars = nullptr;
|
||||
void * get_opt_pars_ud = nullptr;
|
||||
struct ggml_tensor * adamw_params = nullptr;
|
||||
ggml_opt_get_optimizer_params get_opt_pars = nullptr;
|
||||
void * get_opt_pars_ud = nullptr;
|
||||
struct ggml_tensor * opt_step_params = nullptr; // Stores output of get_opt_pars.
|
||||
|
||||
enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
};
|
||||
|
||||
struct ggml_opt_result {
|
||||
@@ -229,9 +231,13 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
|
||||
result.adamw.eps = 1e-8f;
|
||||
result.adamw.wd = 0.0f;
|
||||
|
||||
result.sgd.alpha = 1e-3f;
|
||||
result.sgd.wd = 0.0f;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) {
|
||||
return *((struct ggml_opt_optimizer_params *) userdata);
|
||||
}
|
||||
@@ -249,6 +255,7 @@ struct ggml_opt_params ggml_opt_default_params(
|
||||
/*opt_period =*/ 1,
|
||||
/*get_opt_pars =*/ ggml_opt_get_default_optimizer_params,
|
||||
/*get_opt_pars_ud =*/ nullptr,
|
||||
/*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -316,9 +323,14 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
||||
GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc");
|
||||
GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");
|
||||
|
||||
const enum ggml_opt_optimizer_type optimizer = opt_ctx->optimizer;
|
||||
|
||||
const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD &&
|
||||
!(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1);
|
||||
|
||||
const bool need_momenta = opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT &&
|
||||
opt_ctx->optimizer == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
|
||||
ggml_set_input(opt_ctx->inputs);
|
||||
ggml_set_output(opt_ctx->outputs);
|
||||
|
||||
@@ -340,8 +352,7 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
||||
// - pred (if using static graphs)
|
||||
// - ncorrect (if using static graphs, 2 tensors).
|
||||
constexpr size_t n_loss = 1;
|
||||
const size_t tensors_per_param = (accumulate ? 1 : 0) +
|
||||
(opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
|
||||
const size_t tensors_per_param = (accumulate ? 1 : 0) + (need_momenta ? 2 : 0);
|
||||
const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0;
|
||||
const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * ggml_tensor_overhead();
|
||||
struct ggml_init_params params = {
|
||||
@@ -458,7 +469,7 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
||||
}
|
||||
}
|
||||
|
||||
if (opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
|
||||
if (need_momenta && opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
|
||||
opt_ctx->grad_m.resize(n_nodes);
|
||||
opt_ctx->grad_v.resize(n_nodes);
|
||||
for (int i = 0; i < n_nodes; ++i) {
|
||||
@@ -492,23 +503,36 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
||||
// gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
|
||||
opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
|
||||
|
||||
opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, 7);
|
||||
ggml_set_input(opt_ctx->adamw_params);
|
||||
ggml_set_name(opt_ctx->adamw_params, "adamw_params");
|
||||
|
||||
opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 7 : 2);
|
||||
ggml_tensor * adamw_params = opt_ctx->opt_step_params;
|
||||
ggml_set_input(adamw_params);
|
||||
const char * optimizer_name = ggml_opt_optimizer_name(opt_ctx->optimizer);
|
||||
ggml_format_name(adamw_params, "%s_params", optimizer_name);
|
||||
for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) {
|
||||
struct ggml_tensor * node = opt_ctx->gb_opt->nodes[i];
|
||||
struct ggml_tensor * grad = ggml_graph_get_grad(opt_ctx->gb_opt, node);
|
||||
|
||||
if (grad && (node->flags & GGML_TENSOR_FLAG_PARAM)) {
|
||||
struct ggml_tensor * m = opt_ctx->grad_m[i];
|
||||
struct ggml_tensor * v = opt_ctx->grad_v[i];
|
||||
struct ggml_tensor * opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params);
|
||||
|
||||
ggml_set_name(m, (std::string("AdamW m for ") + std::string(node->name)).c_str());
|
||||
ggml_set_name(v, (std::string("AdamW v for ") + std::string(node->name)).c_str());
|
||||
ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str());
|
||||
|
||||
struct ggml_tensor * m = nullptr;
|
||||
struct ggml_tensor * v = nullptr;
|
||||
if (need_momenta) {
|
||||
m = opt_ctx->grad_m[i];
|
||||
v = opt_ctx->grad_v[i];
|
||||
ggml_format_name(m, "AdamW m for %s", node->name);
|
||||
ggml_format_name(v, "AdamW v for %s", node->name);
|
||||
}
|
||||
struct ggml_tensor * opt_step;
|
||||
switch (optimizer) {
|
||||
case GGML_OPT_OPTIMIZER_TYPE_ADAMW:
|
||||
opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, adamw_params);
|
||||
break;
|
||||
case GGML_OPT_OPTIMIZER_TYPE_SGD:
|
||||
opt_step = ggml_opt_step_sgd(opt_ctx->ctx_compute, node, grad, adamw_params);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
ggml_format_name(opt_step, "%s step for %s", optimizer_name, node->name);
|
||||
ggml_build_forward_expand(opt_ctx->gb_opt, opt_step);
|
||||
}
|
||||
}
|
||||
@@ -534,6 +558,7 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
|
||||
result->opt_period = params.opt_period;
|
||||
result->get_opt_pars = params.get_opt_pars;
|
||||
result->get_opt_pars_ud = params.get_opt_pars_ud;
|
||||
result->optimizer = params.optimizer;
|
||||
|
||||
GGML_ASSERT(result->opt_period >= 1);
|
||||
|
||||
@@ -756,29 +781,43 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
|
||||
void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
|
||||
GGML_ASSERT(opt_ctx->eval_ready);
|
||||
if (opt_ctx->allocated_graph == opt_ctx->gb_opt) {
|
||||
struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
|
||||
const ggml_opt_optimizer_params & opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
|
||||
|
||||
GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
|
||||
switch (opt_ctx->optimizer) {
|
||||
case GGML_OPT_OPTIMIZER_TYPE_ADAMW: {
|
||||
GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
|
||||
|
||||
// beta1, beta2 after applying warmup
|
||||
const float beta1h = 1.0f/(1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
|
||||
const float beta2h = 1.0f/(1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
|
||||
// beta1, beta2 after applying warmup
|
||||
const float beta1h = 1.0f / (1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
|
||||
const float beta2h = 1.0f / (1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
|
||||
|
||||
float * adamw_par_data = ggml_get_data_f32(opt_ctx->adamw_params);
|
||||
adamw_par_data[0] = opt_pars.adamw.alpha;
|
||||
adamw_par_data[1] = opt_pars.adamw.beta1;
|
||||
adamw_par_data[2] = opt_pars.adamw.beta2;
|
||||
adamw_par_data[3] = opt_pars.adamw.eps;
|
||||
adamw_par_data[4] = opt_pars.adamw.wd;
|
||||
adamw_par_data[5] = beta1h;
|
||||
adamw_par_data[6] = beta2h;
|
||||
float * adamw_par_data = ggml_get_data_f32(opt_ctx->opt_step_params);
|
||||
adamw_par_data[0] = opt_pars.adamw.alpha;
|
||||
adamw_par_data[1] = opt_pars.adamw.beta1;
|
||||
adamw_par_data[2] = opt_pars.adamw.beta2;
|
||||
adamw_par_data[3] = opt_pars.adamw.eps;
|
||||
adamw_par_data[4] = opt_pars.adamw.wd;
|
||||
adamw_par_data[5] = beta1h;
|
||||
adamw_par_data[6] = beta2h;
|
||||
} break;
|
||||
case GGML_OPT_OPTIMIZER_TYPE_SGD: {
|
||||
GGML_ASSERT(opt_pars.sgd.alpha > 0.0f);
|
||||
GGML_ASSERT(opt_pars.sgd.wd >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.sgd.wd <= 1.0f);
|
||||
float * sgd = ggml_get_data_f32(opt_ctx->opt_step_params);
|
||||
sgd[0] = opt_pars.sgd.alpha;
|
||||
sgd[1] = opt_pars.sgd.wd;
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
|
||||
@@ -963,6 +1002,7 @@ void ggml_opt_fit(
|
||||
ggml_tensor * outputs,
|
||||
ggml_opt_dataset_t dataset,
|
||||
enum ggml_opt_loss_type loss_type,
|
||||
enum ggml_opt_optimizer_type optimizer,
|
||||
ggml_opt_get_optimizer_params get_opt_pars,
|
||||
int64_t nepoch,
|
||||
int64_t nbatch_logical,
|
||||
@@ -993,6 +1033,7 @@ void ggml_opt_fit(
|
||||
params.opt_period = opt_period;
|
||||
params.get_opt_pars = get_opt_pars;
|
||||
params.get_opt_pars_ud = &epoch;
|
||||
params.optimizer = optimizer;
|
||||
ggml_opt_context_t opt_ctx = ggml_opt_init(params);
|
||||
|
||||
// Shuffling the data is generally useful but there is only a point if not all data is used in a single batch.
|
||||
@@ -1035,3 +1076,18 @@ void ggml_opt_fit(
|
||||
ggml_opt_result_free(result_train);
|
||||
ggml_opt_result_free(result_val);
|
||||
}
|
||||
|
||||
enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t c) {
|
||||
return c->optimizer;
|
||||
}
|
||||
|
||||
GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type o) {
|
||||
switch (o) {
|
||||
case GGML_OPT_OPTIMIZER_TYPE_ADAMW:
|
||||
return "adamw";
|
||||
case GGML_OPT_OPTIMIZER_TYPE_SGD:
|
||||
return "sgd";
|
||||
default:
|
||||
return "undefined";
|
||||
};
|
||||
}
|
||||
|
||||
@@ -288,7 +288,7 @@ void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RE
|
||||
}
|
||||
}
|
||||
|
||||
const uint8_t e = (uint8_t) (floorf(log2f(amax)) - 2 + 127);
|
||||
const uint8_t e = amax > 0.0f ? (uint8_t) (floorf(log2f(amax)) - 2 + 127) : 0;
|
||||
|
||||
const float d = GGML_E8M0_TO_FP32_HALF(e);
|
||||
|
||||
|
||||
@@ -29,9 +29,12 @@
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <filesystem>
|
||||
#include <algorithm>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
|
||||
|
||||
#ifdef _WIN32
|
||||
typedef SOCKET sockfd_t;
|
||||
using ssize_t = __int64;
|
||||
@@ -323,11 +326,14 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
|
||||
static bool send_data(sockfd_t sockfd, const void * data, size_t size) {
|
||||
size_t bytes_sent = 0;
|
||||
while (bytes_sent < size) {
|
||||
ssize_t n = send(sockfd, (const char *)data + bytes_sent, size - bytes_sent, 0);
|
||||
size_t size_to_send = std::min(size - bytes_sent, MAX_CHUNK_SIZE);
|
||||
ssize_t n = send(sockfd, (const char *)data + bytes_sent, size_to_send, 0);
|
||||
if (n < 0) {
|
||||
GGML_LOG_ERROR("send failed (bytes_sent=%zu, size_to_send=%zu)\n",
|
||||
bytes_sent, size_to_send);
|
||||
return false;
|
||||
}
|
||||
bytes_sent += n;
|
||||
bytes_sent += (size_t)n;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -335,11 +341,18 @@ static bool send_data(sockfd_t sockfd, const void * data, size_t size) {
|
||||
static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
|
||||
size_t bytes_recv = 0;
|
||||
while (bytes_recv < size) {
|
||||
ssize_t n = recv(sockfd, (char *)data + bytes_recv, size - bytes_recv, 0);
|
||||
if (n <= 0) {
|
||||
size_t size_to_recv = std::min(size - bytes_recv, MAX_CHUNK_SIZE);
|
||||
ssize_t n = recv(sockfd, (char *)data + bytes_recv, size_to_recv, 0);
|
||||
if (n < 0) {
|
||||
GGML_LOG_ERROR("recv failed (bytes_recv=%zu, size_to_recv=%zu)\n",
|
||||
bytes_recv, size_to_recv);
|
||||
return false;
|
||||
}
|
||||
bytes_recv += n;
|
||||
if (n == 0) {
|
||||
GGML_LOG_ERROR("recv returned 0 (peer closed?)\n");
|
||||
return false;
|
||||
}
|
||||
bytes_recv += (size_t)n;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -823,10 +836,10 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||
};
|
||||
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_rpc_guid(),
|
||||
/* .interface = */ ggml_backend_rpc_interface,
|
||||
/* .device = */ ggml_backend_rpc_add_device(endpoint),
|
||||
/* .context = */ ctx
|
||||
/* .guid = */ ggml_backend_rpc_guid(),
|
||||
/* .iface = */ ggml_backend_rpc_interface,
|
||||
/* .device = */ ggml_backend_rpc_add_device(endpoint),
|
||||
/* .context = */ ctx
|
||||
};
|
||||
return backend;
|
||||
}
|
||||
|
||||
@@ -2705,9 +2705,9 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||
" : converting src1 to fp16");
|
||||
|
||||
// iterate tensor dims and find the slowest moving dim and stride
|
||||
int64_t last_dim=0;
|
||||
int64_t last_str=0;
|
||||
int64_t largest_str=0;
|
||||
int last_dim=0;
|
||||
int last_str=0;
|
||||
size_t largest_str=0;
|
||||
for(int i = 0; i< 4; i++){
|
||||
// last stride is always the largest
|
||||
if(src1->nb[i] == largest_str){
|
||||
@@ -2783,7 +2783,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||
auto launch_gemm_for_batches = [&ctx, queue](const sycl::half *src0,
|
||||
const sycl::half *src1, float *dst,
|
||||
int64_t a0, int64_t a1, int64_t batcha,
|
||||
int64_t b0, int64_t b1, int64_t batchb,
|
||||
int64_t /*b0*/, int64_t b1, int64_t batchb,
|
||||
int64_t sa0, int64_t sa1, int64_t sa2,
|
||||
int64_t sb0, int64_t sb1, int64_t sb2,
|
||||
int64_t sd2) {
|
||||
@@ -2832,14 +2832,26 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||
}
|
||||
};
|
||||
|
||||
bool cont_batches_a = nb02 * ne02 == nb03;
|
||||
bool cont_batches_b = nb12 * ne12 == nb13;
|
||||
if (cont_batches_a && cont_batches_b) {
|
||||
const bool cont_batches_dim2_a = nb02 * ne02 == nb03;
|
||||
const bool cont_batches_dim2_b = nb12 * ne12 == nb13;
|
||||
const bool cont_batches_dim3_a = ne02 == 1 && nb02 * ne01 == nb03;
|
||||
const bool cont_batches_dim3_b = ne12 == 1 && nb12 * ne11 == nb13;
|
||||
if (cont_batches_dim2_a && cont_batches_dim2_b) {
|
||||
// A batch is considered contiguous if the dimension 2 is not strided
|
||||
int64_t batches0 = ne02 * ne03;
|
||||
int64_t batches1 = ne12 * ne13;
|
||||
launch_gemm_for_batches(src0_f16, src1_f16, dst_ddf, ne00, ne01, batches0,
|
||||
ne10, ne11, batches1, str_a0, str_a1, str_a2, str_b0, str_b1,
|
||||
str_b2, nb2 / sizeof(float));
|
||||
} else if (cont_batches_dim3_a && cont_batches_dim3_b) {
|
||||
// This case is similar to the one above with the difference that only the batch in dimension 3 is used and the dimension 2 is of size 1.
|
||||
int64_t batches0 = ne02 * ne03;
|
||||
int64_t batches1 = ne12 * ne13;
|
||||
int64_t str_a3 = nb03 / type_size_src0;
|
||||
int64_t str_b3 = nb13 / type_size_src1;
|
||||
launch_gemm_for_batches(src0_f16, src1_f16, dst_ddf, ne00, ne01, batches0,
|
||||
ne10, ne11, batches1, str_a0, str_a1, str_a3, str_b0, str_b1,
|
||||
str_b3, nb2 / sizeof(float));
|
||||
} else {
|
||||
for (int64_t b_a = 0; b_a < ne03; b_a++) {
|
||||
const sycl::half *src0_f16_shifted
|
||||
@@ -4215,6 +4227,15 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
// FIXME: keep a list of supported types to avoid breaking the backend when a new type is added
|
||||
return false;
|
||||
}
|
||||
// TODO: The configuration below needs more work to be supported with oneDNN
|
||||
if (ggml_is_permuted(a) && !ggml_is_contiguous(a) && a->ne[2] > 1 && a->ne[3] > 1) {
|
||||
return false;
|
||||
}
|
||||
// TODO: This specific configuration can fail with oneDNN and needs more debugging
|
||||
if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 &&
|
||||
a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_OUT_PROD:
|
||||
@@ -4586,10 +4607,10 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
|
||||
};
|
||||
|
||||
ggml_backend_t sycl_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_sycl_guid(),
|
||||
/* .interface = */ ggml_backend_sycl_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device),
|
||||
/* .context = */ ctx
|
||||
/* .guid = */ ggml_backend_sycl_guid(),
|
||||
/* .iface = */ ggml_backend_sycl_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device),
|
||||
/* .context = */ ctx
|
||||
};
|
||||
|
||||
return sycl_backend;
|
||||
|
||||
@@ -510,6 +510,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_rwkv_wkv6_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv7_f32;
|
||||
vk_pipeline pipeline_opt_step_adamw_f32;
|
||||
vk_pipeline pipeline_opt_step_sgd_f32;
|
||||
vk_pipeline pipeline_conv2d_f32[CONV_SHAPE_COUNT];
|
||||
vk_pipeline pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT];
|
||||
vk_pipeline pipeline_conv2d_dw_whcn_f32;
|
||||
@@ -534,6 +535,7 @@ struct vk_device_struct {
|
||||
ggml_backend_buffer_type buffer_type;
|
||||
|
||||
bool disable_fusion;
|
||||
bool disable_host_visible_vidmem;
|
||||
|
||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
||||
std::unique_ptr<vk_memory_logger> memory_logger;
|
||||
@@ -1109,17 +1111,23 @@ class vk_perf_logger {
|
||||
return;
|
||||
}
|
||||
if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
|
||||
const uint64_t m = node->src[0]->ne[1];
|
||||
const uint64_t n = node->src[1]->ne[1];
|
||||
const uint64_t k = node->src[1]->ne[0];
|
||||
std::string name = ggml_op_name(node->op);
|
||||
if (n == 1) {
|
||||
name += "_VEC m=" + std::to_string(m) + " k=" + std::to_string(k);
|
||||
} else {
|
||||
name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
|
||||
const uint64_t m = node->src[0]->ne[1];
|
||||
const uint64_t n = node->ne[1];
|
||||
const uint64_t k = node->src[1]->ne[0];
|
||||
const uint64_t batch = node->src[1]->ne[2] * node->src[1]->ne[3];
|
||||
std::string name = ggml_op_name(node->op);
|
||||
if ((node->op == GGML_OP_MUL_MAT && n <= mul_mat_vec_max_cols) ||
|
||||
(node->op == GGML_OP_MUL_MAT_ID && node->src[2]->ne[1] == 1)) {
|
||||
name += "_VEC";
|
||||
}
|
||||
name += " ";
|
||||
name += ggml_type_name(node->src[0]->type);
|
||||
name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
|
||||
if (batch > 1) {
|
||||
name += " batch=" + std::to_string(batch);
|
||||
}
|
||||
timings[name].push_back(time);
|
||||
flops[name].push_back(m * n * (k + (k - 1)));
|
||||
flops[name].push_back(m * n * (k + (k - 1)) * batch);
|
||||
return;
|
||||
}
|
||||
if (node->op == GGML_OP_CONV_2D) {
|
||||
@@ -1804,6 +1812,8 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
|
||||
} else if (device->uma) {
|
||||
// Fall back to host memory type
|
||||
buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
} else if (device->disable_host_visible_vidmem) {
|
||||
buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
} else {
|
||||
// use rebar if available, otherwise fallback to device only visible memory
|
||||
buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
@@ -2283,14 +2293,14 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
};
|
||||
|
||||
#define CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, HSK, HSV, HEAD_SIZES) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
|
||||
|
||||
#define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \
|
||||
CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 64, 64, 64) \
|
||||
@@ -2907,7 +2917,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_MXFP4], "get_rows_mxfp4_f32", get_rows_mxfp4_f32_len, get_rows_mxfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 4 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, 5 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
|
||||
|
||||
for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
|
||||
@@ -3120,6 +3130,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_opt_step_sgd_f32, "opt_step_sgd_f32", opt_step_sgd_f32_len, opt_step_sgd_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
// conv2d
|
||||
for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
|
||||
uint32_t conv2d_WG_SIZE = 256;
|
||||
@@ -3265,6 +3277,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY");
|
||||
device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr;
|
||||
|
||||
const char* GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM = getenv("GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM");
|
||||
device->disable_host_visible_vidmem = GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM != nullptr;
|
||||
|
||||
bool fp16_storage = false;
|
||||
bool fp16_compute = false;
|
||||
bool maintenance4_support = false;
|
||||
@@ -6501,11 +6516,14 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co
|
||||
return supported;
|
||||
}
|
||||
|
||||
static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, ggml_tensor * dst, bool dryrun = false) {
|
||||
static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst, bool dryrun = false) {
|
||||
VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3];
|
||||
std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3];
|
||||
std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3];
|
||||
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
|
||||
if (sinks) {
|
||||
std::cerr << "), (" << sinks << ", name=" << sinks->name << ", type=" << sinks->type << ", ne0=" << sinks->ne[0] << ", ne1=" << sinks->ne[1] << ", ne2=" << sinks->ne[2] << ", ne3=" << sinks->ne[3] << ", nb0=" << sinks->nb[0] << ", nb1=" << sinks->nb[1] << ", nb2=" << sinks->nb[2] << ", nb3=" << sinks->nb[3];
|
||||
}
|
||||
std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
|
||||
@@ -6704,10 +6722,10 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr;
|
||||
size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0;
|
||||
vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr, d_S = nullptr;
|
||||
size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0, s_buf_offset = 0;
|
||||
|
||||
bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false;
|
||||
bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false, S_uma = false;
|
||||
|
||||
if (ctx->device->uma) {
|
||||
ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset);
|
||||
@@ -6722,6 +6740,10 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
ggml_vk_host_get(ctx->device, mask->data, d_M, m_buf_offset);
|
||||
M_uma = d_M != nullptr;
|
||||
}
|
||||
if (sinks) {
|
||||
ggml_vk_host_get(ctx->device, sinks->data, d_S, s_buf_offset);
|
||||
S_uma = d_S != nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6757,7 +6779,17 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t mask_n_head_log2 = ((mask != nullptr) << 16) | n_head_log2;
|
||||
if (!S_uma) {
|
||||
d_S = d_Q;
|
||||
s_buf_offset = q_buf_offset;
|
||||
if (sinks) {
|
||||
ggml_backend_vk_buffer_context * s_buf_ctx = (ggml_backend_vk_buffer_context*)sinks->buffer->context;
|
||||
d_S = s_buf_ctx->dev_buffer;
|
||||
s_buf_offset = vk_tensor_offset(sinks) + sinks->view_offs;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | ((mask != nullptr) << 16) | n_head_log2;
|
||||
|
||||
const vk_flash_attn_push_constants pc = { N, KV,
|
||||
(uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
|
||||
@@ -6781,6 +6813,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
|
||||
},
|
||||
// We only use split_k when group query attention is enabled, which means
|
||||
@@ -6790,10 +6823,11 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
const std::array<uint32_t, 4> pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k };
|
||||
const std::array<uint32_t, 5> pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k, (sinks != nullptr) };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
|
||||
{
|
||||
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
|
||||
},
|
||||
pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 });
|
||||
@@ -6804,6 +6838,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
|
||||
},
|
||||
pc, { workgroups_x, workgroups_y, workgroups_z });
|
||||
@@ -7167,6 +7202,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_opt_step_adamw_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_opt_step_sgd_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_leaky_relu_f32;
|
||||
@@ -7666,6 +7706,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz);
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
||||
} else if (op == GGML_OP_OPT_STEP_SGD) {
|
||||
// OPT_STEP_SGD works on src0, it does not need dst
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz } }, pc, elements);
|
||||
} else if (use_src2) {
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
||||
@@ -8019,6 +8063,12 @@ static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& su
|
||||
);
|
||||
}
|
||||
|
||||
static void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
|
||||
const size_t n = ggml_nelements(dst->src[0]);
|
||||
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f }, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
int * op_params = (int *)dst->op_params;
|
||||
|
||||
@@ -8310,7 +8360,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons
|
||||
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
||||
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
|
||||
src2 != nullptr, (uint32_t)src0->ne[2], s1, s2,
|
||||
sections[0], sections[1], sections[2], sections[3], backprop
|
||||
{ sections[0], sections[1], sections[2], sections[3] }, backprop
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
@@ -8342,7 +8392,7 @@ static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
|
||||
}
|
||||
|
||||
static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f }, dryrun);
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
@@ -9572,11 +9622,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
break;
|
||||
default:
|
||||
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
|
||||
GGML_ABORT("fatal error");
|
||||
return false;
|
||||
}
|
||||
|
||||
vk_context compute_ctx;
|
||||
@@ -9636,6 +9686,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
{
|
||||
// These operations all go through ggml_vk_op_f32, so short-circuit and
|
||||
// do the only thing needed for the dryrun.
|
||||
@@ -9868,7 +9919,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
break;
|
||||
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node, dryrun);
|
||||
ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node->src[4], node, dryrun);
|
||||
|
||||
break;
|
||||
|
||||
@@ -9885,6 +9936,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
ggml_vk_opt_step_adamw(ctx, compute_ctx, node, dryrun);
|
||||
|
||||
break;
|
||||
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
ggml_vk_opt_step_sgd(ctx, compute_ctx, src0, src1, src2, node, dryrun);
|
||||
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
@@ -9988,8 +10044,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
buf = tensor->buffer;
|
||||
|
||||
break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(tensor)) {
|
||||
@@ -10741,10 +10797,10 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
||||
ggml_vk_init(ctx, dev_num);
|
||||
|
||||
ggml_backend_t vk_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_vk_guid(),
|
||||
/* .interface = */ ggml_backend_vk_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
|
||||
/* .context = */ ctx,
|
||||
/* .guid = */ ggml_backend_vk_guid(),
|
||||
/* .iface = */ ggml_backend_vk_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
return vk_backend;
|
||||
@@ -10855,7 +10911,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_GLU:
|
||||
switch (ggml_get_glu_op(op)) {
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
@@ -10871,7 +10926,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
@@ -10935,7 +10989,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
}
|
||||
|
||||
return true;
|
||||
} break;
|
||||
}
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
@@ -10945,8 +10999,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
if (head_sizes == FA_HEAD_SIZE_UNSUPPORTED) {
|
||||
return false;
|
||||
}
|
||||
// TODO: support attention sinks [TAG_ATTN_SINKS]
|
||||
if (op->src[4]) {
|
||||
if (op->src[4] && op->src[4]->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->type != GGML_TYPE_F32) {
|
||||
@@ -11026,7 +11079,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
}
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
switch (op->type) {
|
||||
@@ -11043,7 +11096,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
}
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP:
|
||||
@@ -11095,7 +11148,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
} break;
|
||||
}
|
||||
case GGML_OP_REPEAT:
|
||||
return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
@@ -11129,6 +11182,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_ACC:
|
||||
@@ -11150,8 +11206,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_RWKV_WKV7:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
return true;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
@@ -11541,6 +11595,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
if (tensor->op == GGML_OP_FLASH_ATTN_EXT) {
|
||||
const float * params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]);
|
||||
if (src_clone[4]) {
|
||||
ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]);
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_MUL_MAT) {
|
||||
tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_MUL_MAT_ID) {
|
||||
@@ -11746,6 +11803,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
src_clone[0]->flags = src0->flags;
|
||||
tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1],
|
||||
src_clone[2], src_clone[3], src_clone[4]);
|
||||
} else if (tensor->op == GGML_OP_OPT_STEP_SGD) {
|
||||
src_clone[0]->flags = src0->flags;
|
||||
tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1],
|
||||
src_clone[2]);
|
||||
}
|
||||
else {
|
||||
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
||||
|
||||
@@ -5,6 +5,8 @@
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
#define FLT_MAX 3.402823466e+38F
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
@@ -19,19 +21,26 @@ void main() {
|
||||
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||
const uint col = gl_LocalInvocationID.x;
|
||||
|
||||
if (col >= p.KX) {
|
||||
if (row >= p.KY) {
|
||||
return;
|
||||
}
|
||||
A_TYPE amax = data_a[row*p.KX + col];
|
||||
tmp[col] = col;
|
||||
|
||||
A_TYPE amax = -FLT_MAX;
|
||||
uint acol = col;
|
||||
|
||||
if (col < p.KX) {
|
||||
amax = data_a[row*p.KX + col];
|
||||
}
|
||||
|
||||
for (uint i = col + BLOCK_SIZE; i < p.KX; i += BLOCK_SIZE) {
|
||||
A_TYPE val = data_a[row*p.KX + i];
|
||||
if (val > amax) {
|
||||
amax = val;
|
||||
tmp[col] = i;
|
||||
acol = i;
|
||||
}
|
||||
}
|
||||
|
||||
tmp[col] = acol;
|
||||
tmpmax[col] = amax;
|
||||
|
||||
barrier();
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user