mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-16 10:46:43 +02:00
Compare commits
49 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 059031b8c4 | |||
| 511182eabb | |||
| 133d99c599 | |||
| cb42c29427 | |||
| d233b507cd | |||
| 0f98acfac6 | |||
| ca57e0f35e | |||
| c1b295eea5 | |||
| de73196344 | |||
| b49a13dd2f | |||
| 05834841dc | |||
| ef277de2ad | |||
| b43272afa2 | |||
| 0fc1e820a9 | |||
| 82ca83db3c | |||
| f4bd8b3d26 | |||
| 51e9d02599 | |||
| d273c1402b | |||
| 27b040691c | |||
| 29c60d8cdd | |||
| 359cbe3f46 | |||
| e18bc6aaf3 | |||
| ee94172d33 | |||
| 934266c0e0 | |||
| 9c4fdcbec8 | |||
| 24ecb58168 | |||
| 9afdffe70e | |||
| 3b3963c55c | |||
| dda64fc17c | |||
| 0350f58152 | |||
| ad52d5c259 | |||
| 172b78210a | |||
| 13ad16af12 | |||
| 8f7080bf48 | |||
| e1b40ac3b9 | |||
| dc020985b8 | |||
| 344f9126cc | |||
| 9a17ab914b | |||
| ea3b0590ee | |||
| 29499bb593 | |||
| 48aa8fd1f2 | |||
| 583fd6b000 | |||
| 9f773486ab | |||
| e8a7fd4fb0 | |||
| a5e3fde857 | |||
| f308ea7059 | |||
| c3c88f296a | |||
| 182adefcf3 | |||
| 0d26d8ccd8 |
@@ -227,20 +227,20 @@ effectiveStdenv.mkDerivation (
|
||||
)
|
||||
]
|
||||
++ optionals useRocm [
|
||||
(cmakeFeature "CMAKE_C_COMPILER" "hipcc")
|
||||
(cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
|
||||
|
||||
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
|
||||
# in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
|
||||
# and select the line that matches the current nixpkgs version of rocBLAS.
|
||||
# Should likely use `rocmPackages.clr.gpuTargets`.
|
||||
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
|
||||
(cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
|
||||
(cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
|
||||
]
|
||||
++ optionals useMetalKit [
|
||||
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
|
||||
(cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
|
||||
];
|
||||
|
||||
# Environment variables needed for ROCm
|
||||
env = optionals useRocm {
|
||||
ROCM_PATH = "${rocmPackages.clr}";
|
||||
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
|
||||
};
|
||||
|
||||
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
||||
# if they haven't been added yet.
|
||||
postInstall = ''
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
# https://github.com/actions/labeler
|
||||
|
||||
SYCL:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-sycl.h
|
||||
- ggml-sycl.cpp
|
||||
- README-sycl.md
|
||||
Nvidia GPU:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-cuda/**
|
||||
Vulkan:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml_vk_generate_shaders.py
|
||||
- ggml-vulkan*
|
||||
documentation:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- docs/**
|
||||
- media/**
|
||||
testing:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- tests/**
|
||||
build:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- cmake/**
|
||||
- CMakeLists.txt
|
||||
- CMakePresets.json
|
||||
- codecov.yml
|
||||
examples:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file: examples/**
|
||||
devops:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- .devops/**
|
||||
- .github/**
|
||||
- ci/**
|
||||
python:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- "**/*.py"
|
||||
- requirements/**
|
||||
- gguf-py/**
|
||||
- .flake8
|
||||
script:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- scripts/**
|
||||
android:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- examples/llama.android/**
|
||||
server:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- examples/server/**
|
||||
ggml:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-*.c
|
||||
- ggml-*.h
|
||||
- ggml-cuda/**
|
||||
nix:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- "**/*.nix"
|
||||
- .github/workflows/nix-*.yml
|
||||
- .devops/nix/nixpkgs-instances.nix
|
||||
+125
-62
@@ -271,40 +271,40 @@ jobs:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
|
||||
name: llama-bin-ubuntu-x64.zip
|
||||
|
||||
# ubuntu-latest-cmake-sanitizer:
|
||||
# runs-on: ubuntu-latest
|
||||
#
|
||||
# continue-on-error: true
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
# build_type: [Debug, Release]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# mkdir build
|
||||
# cd build
|
||||
# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
# cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
|
||||
#
|
||||
# - name: Test
|
||||
# id: cmake_test
|
||||
# run: |
|
||||
# cd build
|
||||
# ctest -L main --verbose --timeout 900
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug, Release]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
ubuntu-latest-cmake-mpi:
|
||||
runs-on: ubuntu-latest
|
||||
@@ -392,6 +392,33 @@ jobs:
|
||||
cmake -DLLAMA_VULKAN=ON ..
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-cmake-hip:
|
||||
runs-on: ubuntu-22.04
|
||||
container: rocm/dev-ubuntu-22.04:6.0.2
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
|
||||
|
||||
- name: Build with native CMake HIP support
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Build with legacy HIP support
|
||||
id: cmake_build_legacy_hip
|
||||
run: |
|
||||
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
|
||||
cmake --build build2 --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-cmake-sycl:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
@@ -693,26 +720,28 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'rpc'
|
||||
- build: 'rpc-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'noavx'
|
||||
- build: 'noavx-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2'
|
||||
- build: 'avx2-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx'
|
||||
- build: 'avx-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512'
|
||||
- build: 'avx512-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'clblast'
|
||||
- build: 'clblast-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
- build: 'openblas-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'kompute'
|
||||
- build: 'kompute-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'vulkan'
|
||||
- build: 'vulkan-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'arm64'
|
||||
defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'llvm-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'msvc-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -723,13 +752,13 @@ jobs:
|
||||
|
||||
- name: Clone Kompute submodule
|
||||
id: clone_kompute
|
||||
if: ${{ matrix.build == 'kompute' }}
|
||||
if: ${{ matrix.build == 'kompute-x64' }}
|
||||
run: |
|
||||
git submodule update --init kompute
|
||||
|
||||
- name: Download OpenCL SDK
|
||||
id: get_opencl
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
|
||||
mkdir $env:RUNNER_TEMP/opencl
|
||||
@@ -737,7 +766,7 @@ jobs:
|
||||
|
||||
- name: Download CLBlast
|
||||
id: get_clblast
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
|
||||
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
|
||||
@@ -750,7 +779,7 @@ jobs:
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas' }}
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
@@ -763,38 +792,41 @@ jobs:
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
|
||||
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. ${{ matrix.defines }}
|
||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
cmake -S . -B build ${{ matrix.defines }}
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add clblast.dll
|
||||
id: add_clblast_dll
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
|
||||
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'openblas' }}
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Check AVX512F support
|
||||
id: check_avx512f
|
||||
if: ${{ matrix.build == 'avx512' }}
|
||||
if: ${{ matrix.build == 'avx512-x64' }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cd build
|
||||
@@ -808,14 +840,14 @@ jobs:
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# not all machines have native AVX-512
|
||||
if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
|
||||
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
- name: Test (Intel SDE)
|
||||
id: cmake_test_sde
|
||||
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
# for some weird reason windows tar doesn't like sde tar.xz
|
||||
@@ -843,14 +875,14 @@ jobs:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
|
||||
name: llama-bin-win-${{ matrix.build }}-x64.zip
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
windows-latest-cmake-cuda:
|
||||
runs-on: windows-latest
|
||||
@@ -984,6 +1016,37 @@ jobs:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
windows-latest-cmake-hip:
|
||||
runs-on: windows-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Install
|
||||
id: depends
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
|
||||
write-host "Completed AMD HIP SDK installation"
|
||||
|
||||
- name: Verify ROCm
|
||||
id: verify
|
||||
run: |
|
||||
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
|
||||
cmake --build build --config Release
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
name: "Pull Request Labeler"
|
||||
on:
|
||||
- pull_request_target
|
||||
|
||||
jobs:
|
||||
labeler:
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/labeler@v5
|
||||
@@ -32,13 +32,14 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
# TODO: temporary disabled due to linux kernel issues
|
||||
#sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
sanitizer: [UNDEFINED]
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug]
|
||||
include:
|
||||
- build_type: Release
|
||||
sanitizer: ""
|
||||
- build_type: Debug
|
||||
sanitizer: THREAD
|
||||
disabled_on_pr: true
|
||||
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
|
||||
|
||||
steps:
|
||||
|
||||
+40
-9
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
||||
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
||||
project("llama.cpp" C CXX)
|
||||
include(CheckIncludeFileCXX)
|
||||
|
||||
@@ -555,16 +555,37 @@ if (LLAMA_VULKAN)
|
||||
endif()
|
||||
|
||||
if (LLAMA_HIPBLAS)
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
|
||||
if ($ENV{ROCM_PATH})
|
||||
set(ROCM_PATH $ENV{ROCM_PATH})
|
||||
else()
|
||||
set(ROCM_PATH /opt/rocm)
|
||||
endif()
|
||||
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
|
||||
|
||||
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
|
||||
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
|
||||
# CMake on Windows doesn't support the HIP language yet
|
||||
if(WIN32)
|
||||
set(CXX_IS_HIPCC TRUE)
|
||||
else()
|
||||
string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
|
||||
endif()
|
||||
|
||||
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
|
||||
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
|
||||
endif()
|
||||
if(CXX_IS_HIPCC)
|
||||
if(LINUX)
|
||||
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
|
||||
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
|
||||
endif()
|
||||
|
||||
message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
|
||||
" Prefer setting the HIP compiler directly. See README for details.")
|
||||
endif()
|
||||
else()
|
||||
# Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
|
||||
if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
|
||||
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
|
||||
endif()
|
||||
cmake_minimum_required(VERSION 3.21)
|
||||
enable_language(HIP)
|
||||
endif()
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hipblas REQUIRED)
|
||||
find_package(rocblas REQUIRED)
|
||||
@@ -598,13 +619,18 @@ if (LLAMA_HIPBLAS)
|
||||
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
|
||||
if (CXX_IS_HIPCC)
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device)
|
||||
else()
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
|
||||
endif()
|
||||
|
||||
if (LLAMA_STATIC)
|
||||
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
|
||||
endif()
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||
endif()
|
||||
|
||||
if (LLAMA_SYCL)
|
||||
@@ -1007,6 +1033,11 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
|
||||
if (GGML_COMPILER_SUPPORT_DOTPROD)
|
||||
add_compile_definitions(__ARM_FEATURE_DOTPROD)
|
||||
endif ()
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
||||
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
||||
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
|
||||
endif ()
|
||||
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
|
||||
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
|
||||
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
{
|
||||
"version": 4,
|
||||
"configurePresets": [
|
||||
{
|
||||
"name": "base",
|
||||
"hidden": true,
|
||||
"generator": "Ninja",
|
||||
"binaryDir": "${sourceDir}/build-${presetName}",
|
||||
"cacheVariables": {
|
||||
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
||||
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||
}
|
||||
},
|
||||
|
||||
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
|
||||
|
||||
{
|
||||
"name": "arm64-windows-msvc", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-windows-llvm", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
|
||||
}
|
||||
},
|
||||
|
||||
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
|
||||
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
|
||||
|
||||
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
|
||||
]
|
||||
}
|
||||
@@ -560,10 +560,10 @@ endif # LLAMA_VULKAN
|
||||
ifdef LLAMA_HIPBLAS
|
||||
ifeq ($(wildcard /opt/rocm),)
|
||||
ROCM_PATH ?= /usr
|
||||
GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
|
||||
AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
|
||||
else
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
|
||||
AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
|
||||
endif
|
||||
HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
|
||||
LLAMA_CUDA_DMMV_X ?= 32
|
||||
@@ -575,7 +575,7 @@ ifdef LLAMA_HIP_UMA
|
||||
endif # LLAMA_HIP_UMA
|
||||
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
|
||||
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
|
||||
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
|
||||
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
|
||||
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
||||
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
|
||||
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
||||
|
||||
@@ -528,13 +528,28 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
```
|
||||
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
|
||||
```bash
|
||||
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
|
||||
cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||
&& cmake --build build --config Release -- -j 16
|
||||
```
|
||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
|
||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
|
||||
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
||||
|
||||
Note that if you get the following error:
|
||||
```
|
||||
clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
|
||||
```
|
||||
Try searching for a directory under `HIP_PATH` that contains the file
|
||||
`oclc_abi_version_400.bc`. Then, add the following to the start of the
|
||||
command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
|
||||
like:
|
||||
```bash
|
||||
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
|
||||
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
|
||||
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||
&& cmake --build build -- -j 16
|
||||
```
|
||||
|
||||
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
||||
```bash
|
||||
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
|
||||
@@ -543,10 +558,8 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
||||
```bash
|
||||
set PATH=%HIP_PATH%\bin;%PATH%
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
|
||||
cmake --build .
|
||||
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build build
|
||||
```
|
||||
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
|
||||
Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
|
||||
@@ -712,6 +725,9 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
### Prepare and Quantize
|
||||
|
||||
> [!NOTE]
|
||||
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
|
||||
|
||||
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
|
||||
|
||||
Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
set( CMAKE_SYSTEM_NAME Windows )
|
||||
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||
|
||||
set( target arm64-pc-windows-msvc )
|
||||
|
||||
set( CMAKE_C_COMPILER clang )
|
||||
set( CMAKE_CXX_COMPILER clang++ )
|
||||
|
||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||
|
||||
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
|
||||
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
|
||||
|
||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
@@ -0,0 +1,6 @@
|
||||
set( CMAKE_SYSTEM_NAME Windows )
|
||||
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||
|
||||
set( target arm64-pc-windows-msvc )
|
||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||
+1
-1
@@ -2553,7 +2553,7 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha
|
||||
size_t pos_start = 0;
|
||||
size_t pos_found = 0;
|
||||
|
||||
if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
|
||||
if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
|
||||
data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
|
||||
data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
|
||||
data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
|
||||
|
||||
@@ -26,7 +26,7 @@ namespace grammar_parser {
|
||||
|
||||
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
|
||||
auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
|
||||
return result.first->second;
|
||||
}
|
||||
|
||||
|
||||
@@ -272,7 +272,7 @@ private:
|
||||
if (literal.empty()) {
|
||||
return false;
|
||||
}
|
||||
ret.push_back(std::make_pair(literal, true));
|
||||
ret.emplace_back(literal, true);
|
||||
literal.clear();
|
||||
return true;
|
||||
};
|
||||
@@ -298,7 +298,7 @@ private:
|
||||
while (i < length) {
|
||||
char c = sub_pattern[i];
|
||||
if (c == '.') {
|
||||
seq.push_back(std::make_pair(get_dot(), false));
|
||||
seq.emplace_back(get_dot(), false);
|
||||
i++;
|
||||
} else if (c == '(') {
|
||||
i++;
|
||||
@@ -307,7 +307,7 @@ private:
|
||||
_warnings.push_back("Unsupported pattern syntax");
|
||||
}
|
||||
}
|
||||
seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
|
||||
seq.emplace_back("(" + to_rule(transform()) + ")", false);
|
||||
} else if (c == ')') {
|
||||
i++;
|
||||
if (start > 0 && sub_pattern[start - 1] != '(') {
|
||||
@@ -331,9 +331,9 @@ private:
|
||||
}
|
||||
square_brackets += ']';
|
||||
i++;
|
||||
seq.push_back(std::make_pair(square_brackets, false));
|
||||
seq.emplace_back(square_brackets, false);
|
||||
} else if (c == '|') {
|
||||
seq.push_back(std::make_pair("|", false));
|
||||
seq.emplace_back("|", false);
|
||||
i++;
|
||||
} else if (c == '*' || c == '+' || c == '?') {
|
||||
seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
|
||||
@@ -417,7 +417,7 @@ private:
|
||||
}
|
||||
}
|
||||
if (!literal.empty()) {
|
||||
seq.push_back(std::make_pair(literal, true));
|
||||
seq.emplace_back(literal, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+5
-5
@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#else
|
||||
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
|
||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
|
||||
#endif
|
||||
#else
|
||||
#define LOG_FLF_FMT "%s"
|
||||
@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#else
|
||||
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
|
||||
#endif
|
||||
#else
|
||||
#define LOG_TEE_FLF_FMT "%s"
|
||||
@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
||||
// Main LOG macro.
|
||||
// behaves like printf, and supports arguments the exact same way.
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#if !defined(_MSC_VER) || defined(__clang__)
|
||||
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
|
||||
#else
|
||||
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
||||
@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
||||
// Secondary target can be changed just like LOG_TARGET
|
||||
// by defining LOG_TEE_TARGET
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#if !defined(_MSC_VER) || defined(__clang__)
|
||||
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
|
||||
#else
|
||||
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
||||
#endif
|
||||
|
||||
// LOG macro variants with auto endline.
|
||||
#ifndef _MSC_VER
|
||||
#if !defined(_MSC_VER) || defined(__clang__)
|
||||
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
|
||||
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
|
||||
#else
|
||||
|
||||
@@ -20,11 +20,13 @@
|
||||
# - Update llama.cpp with the new pre-tokenizer if necessary
|
||||
#
|
||||
# TODO: generate tokenizer tests for llama.cpp
|
||||
# TODO: automate the update of convert-hf-to-gguf.py
|
||||
#
|
||||
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
|
||||
import requests
|
||||
import sys
|
||||
import json
|
||||
@@ -35,6 +37,7 @@ from transformers import AutoTokenizer
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
logger = logging.getLogger("convert-hf-to-gguf-update")
|
||||
sess = requests.Session()
|
||||
|
||||
|
||||
class TOKENIZER_TYPE(IntEnum):
|
||||
@@ -79,63 +82,44 @@ models = [
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
]
|
||||
|
||||
# make directory "models/tokenizers" if it doesn't exist
|
||||
if not os.path.exists("models/tokenizers"):
|
||||
os.makedirs("models/tokenizers")
|
||||
|
||||
|
||||
def download_file_with_auth(url, token, save_path):
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
response = requests.get(url, headers=headers)
|
||||
if response.status_code == 200:
|
||||
with open(save_path, 'wb') as f:
|
||||
f.write(response.content)
|
||||
logger.info(f"File {save_path} downloaded successfully")
|
||||
else:
|
||||
logger.info(f"Failed to download file. Status code: {response.status_code}")
|
||||
response = sess.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
||||
with open(save_path, 'wb') as f:
|
||||
f.write(response.content)
|
||||
logger.info(f"File {save_path} downloaded successfully")
|
||||
|
||||
|
||||
# download the tokenizer models
|
||||
for model in models:
|
||||
def download_model(model):
|
||||
name = model["name"]
|
||||
repo = model["repo"]
|
||||
tokt = model["tokt"]
|
||||
|
||||
if not os.path.exists(f"models/tokenizers/{name}"):
|
||||
os.makedirs(f"models/tokenizers/{name}")
|
||||
else:
|
||||
logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
|
||||
continue
|
||||
|
||||
logger.info(f"Downloading {name} to models/tokenizers/{name}")
|
||||
|
||||
url = f"{repo}/raw/main/config.json"
|
||||
save_path = f"models/tokenizers/{name}/config.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
|
||||
url = f"{repo}/raw/main/tokenizer.json"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
|
||||
# if downloaded file is less than 1KB, we likely need to download an LFS instead
|
||||
if os.path.getsize(save_path) < 1024:
|
||||
# remove the file
|
||||
os.remove(save_path)
|
||||
url = f"{repo}/resolve/main/tokenizer.json"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
|
||||
|
||||
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
|
||||
if tokt == TOKENIZER_TYPE.SPM:
|
||||
url = f"{repo}/resolve/main/tokenizer.model"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer.model"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
files.append("tokenizer.model")
|
||||
|
||||
for file in files:
|
||||
save_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(save_path):
|
||||
logger.info(f"{name}: File {save_path} already exists - skipping")
|
||||
continue
|
||||
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
|
||||
|
||||
|
||||
for model in models:
|
||||
try:
|
||||
download_model(model)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||
|
||||
url = f"{repo}/raw/main/tokenizer_config.json"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer_config.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
|
||||
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
|
||||
# TODO: auto-update convert-hf-to-gguf.py with the generated function
|
||||
|
||||
src_ifs = ""
|
||||
for model in models:
|
||||
@@ -224,11 +208,18 @@ src_func = f"""
|
||||
return res
|
||||
"""
|
||||
|
||||
print(src_func) # noqa: NP100
|
||||
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
||||
convert_py = convert_py_pth.read_text()
|
||||
convert_py = re.sub(
|
||||
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
||||
lambda m: m.group(1) + src_func + m.group(3),
|
||||
convert_py,
|
||||
flags=re.DOTALL | re.MULTILINE,
|
||||
)
|
||||
|
||||
logger.info("\n")
|
||||
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
|
||||
logger.info("\n")
|
||||
convert_py_pth.write_text(convert_py)
|
||||
|
||||
logger.info("+++ convert-hf-to-gguf.py was updated")
|
||||
|
||||
# generate tests for each tokenizer model
|
||||
|
||||
|
||||
+18
-12
@@ -402,6 +402,7 @@ class Model:
|
||||
# NOTE: this function is generated by convert-hf-to-gguf-update.py
|
||||
# do not modify it manually!
|
||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
# Marker: Start get_vocab_base_pre
|
||||
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||
# is specific for the BPE pre-tokenizer used by the model
|
||||
@@ -489,6 +490,7 @@ class Model:
|
||||
logger.debug(f"chkhsh: {chkhsh}")
|
||||
|
||||
return res
|
||||
# Marker: End get_vocab_base_pre
|
||||
|
||||
def _set_vocab_gpt2(self) -> None:
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
@@ -526,7 +528,7 @@ class Model:
|
||||
|
||||
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
|
||||
added_vocab = tokenizer.special_tokens
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
@@ -571,6 +573,10 @@ class Model:
|
||||
|
||||
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
||||
|
||||
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
||||
scores: list[float] = [-10000.0] * vocab_size
|
||||
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
|
||||
|
||||
for token_id in range(tokenizer.vocab_size()):
|
||||
piece = tokenizer.IdToPiece(token_id)
|
||||
text = piece.encode("utf-8")
|
||||
@@ -586,21 +592,23 @@ class Model:
|
||||
elif tokenizer.IsByte(token_id):
|
||||
toktype = SentencePieceTokenTypes.BYTE
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
tokens[token_id] = text
|
||||
scores[token_id] = score
|
||||
toktypes[token_id] = toktype
|
||||
|
||||
added_tokens_file = self.dir_model / 'added_tokens.json'
|
||||
if added_tokens_file.is_file():
|
||||
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
||||
added_tokens_json = json.load(f)
|
||||
|
||||
for key in added_tokens_json:
|
||||
key = key.encode("utf-8")
|
||||
if key not in tokens:
|
||||
tokens.append(key)
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
|
||||
token_id = added_tokens_json[key]
|
||||
if (token_id >= vocab_size):
|
||||
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||
continue
|
||||
|
||||
tokens[token_id] = key.encode("utf-8")
|
||||
scores[token_id] = -1000.0
|
||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||
|
||||
if vocab_size > len(tokens):
|
||||
pad_count = vocab_size - len(tokens)
|
||||
@@ -610,8 +618,6 @@ class Model:
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
||||
|
||||
assert len(tokens) == vocab_size
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
|
||||
+1
-1
@@ -1109,7 +1109,7 @@ class OutputFile:
|
||||
if metadata is not None and metadata.name is not None:
|
||||
name = metadata.name
|
||||
elif params.path_model is not None:
|
||||
name = str(params.path_model.parent).split("/")[-1]
|
||||
name = params.path_model.name
|
||||
elif params.n_ctx == 4096:
|
||||
# Heuristic detection of LLaMA v2 model
|
||||
name = "LLaMA v2"
|
||||
|
||||
+20
-4
@@ -1,6 +1,6 @@
|
||||
# Debugging Tests Tips
|
||||
|
||||
## How to run & debug a specific test without anything else to keep the feedback loop short?
|
||||
## How to run & execute or debug a specific test without anything else to keep the feedback loop short?
|
||||
|
||||
There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.
|
||||
|
||||
@@ -10,13 +10,27 @@ For example, running the following command will output an interactive list from
|
||||
|
||||
It will then build & run in the debugger for you.
|
||||
|
||||
To just execute a test and get back a PASS or FAIL message run:
|
||||
|
||||
```bash
|
||||
./scripts/debug-test.sh test-tokenizer
|
||||
```
|
||||
|
||||
To test in GDB use the `-g` flag to enable gdb test mode.
|
||||
|
||||
```bash
|
||||
./scripts/debug-test.sh -g test-tokenizer
|
||||
|
||||
# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
|
||||
>>> b main
|
||||
```
|
||||
|
||||
To speed up the testing loop, if you know your test number you can just run it similar to below:
|
||||
|
||||
```bash
|
||||
./scripts/debug-test.sh test 23
|
||||
```
|
||||
|
||||
For further reference use `debug-test.sh -h` to print help.
|
||||
|
||||
|
||||
@@ -41,7 +55,7 @@ cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
|
||||
make -j
|
||||
```
|
||||
|
||||
#### Step 3.1: Identify Test Command for Debugging
|
||||
#### Step 3: Find all tests available that matches REGEX
|
||||
|
||||
The output of this command will give you the command & arguments needed to run GDB.
|
||||
|
||||
@@ -69,11 +83,13 @@ Labels: main
|
||||
...
|
||||
```
|
||||
|
||||
So for test #1 we can tell these two pieces of relevant information:
|
||||
#### Step 4: Identify Test Command for Debugging
|
||||
|
||||
So for test #1 above we can tell these two pieces of relevant information:
|
||||
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
|
||||
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
|
||||
|
||||
#### Step 3.2: Run GDB on test command
|
||||
#### Step 5: Run GDB on test command
|
||||
|
||||
Based on the ctest 'test command' report above we can then run a gdb session via this command below:
|
||||
|
||||
|
||||
@@ -211,6 +211,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// clean up
|
||||
llama_print_timings(ctx);
|
||||
llama_batch_free(batch);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
llama_backend_free();
|
||||
|
||||
@@ -12,15 +12,17 @@ cmake_minimum_required(VERSION 3.22.1)
|
||||
# build script scope).
|
||||
project("llama-android")
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(
|
||||
llama
|
||||
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
|
||||
GIT_TAG master
|
||||
)
|
||||
#include(FetchContent)
|
||||
#FetchContent_Declare(
|
||||
# llama
|
||||
# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
|
||||
# GIT_TAG ci-android
|
||||
#)
|
||||
#
|
||||
## Also provides "common"
|
||||
#FetchContent_MakeAvailable(llama)
|
||||
|
||||
# Also provides "common"
|
||||
FetchContent_MakeAvailable(llama)
|
||||
add_subdirectory(../../../../../../ please-work)
|
||||
|
||||
# Creates and names a library, sets it as either STATIC
|
||||
# or SHARED, and provides the relative paths to its source code.
|
||||
|
||||
@@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
|
||||
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
|
||||
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
|
||||
struct {
|
||||
struct ggml_tensor * newline;
|
||||
struct ggml_context * ctx;
|
||||
} model;
|
||||
|
||||
@@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
|
||||
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
|
||||
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
|
||||
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
|
||||
if (newline_tmp->buffer == NULL) {
|
||||
LOG_TEE("newline_tmp tensor buffer is NULL\n");
|
||||
}
|
||||
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
|
||||
} else {
|
||||
model.newline->data = newline_tmp->data;
|
||||
if (model.newline->data == NULL) {
|
||||
LOG_TEE("newline_tmp tensor data is NULL\n");
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
|
||||
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
|
||||
// fill it with the image embeddings, ignoring the base
|
||||
|
||||
@@ -1425,7 +1425,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
// Use all tasks
|
||||
tasks.resize(n_task);
|
||||
printf("%s: reading tasks", __func__);
|
||||
int n_dot = n_task/100;
|
||||
int n_dot = std::max((int) n_task/100, 1);
|
||||
int i = 0;
|
||||
for (auto& task : tasks) {
|
||||
++i;
|
||||
@@ -1675,7 +1675,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
if (n_done < 100) return;
|
||||
if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
|
||||
|
||||
float p = 1.f*n_correct/n_done;
|
||||
float sigma = sqrt(p*(1-p)/(n_done-1));
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
# quantize
|
||||
|
||||
TODO
|
||||
You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.
|
||||
|
||||
Note: It is synced from llama.cpp `main` every 6 hours.
|
||||
|
||||
## Llama 2 7B
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ cmake --build . --config Release
|
||||
Then, start the `rpc-server` with the backend:
|
||||
|
||||
```bash
|
||||
$ bin/rpc-server 0.0.0.0 50052
|
||||
$ bin/rpc-server -p 50052
|
||||
create_backend: using CUDA backend
|
||||
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
||||
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
|
||||
@@ -53,7 +53,7 @@ Starting RPC server on 0.0.0.0:50052
|
||||
|
||||
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
|
||||
```bash
|
||||
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
|
||||
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
|
||||
```
|
||||
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
|
||||
|
||||
|
||||
+78
-14
@@ -7,9 +7,64 @@
|
||||
#endif
|
||||
|
||||
#include "ggml-rpc.h"
|
||||
#ifdef _WIN32
|
||||
# include <windows.h>
|
||||
#else
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
|
||||
struct rpc_server_params {
|
||||
std::string host = "0.0.0.0";
|
||||
int port = 50052;
|
||||
size_t backend_mem = 0;
|
||||
};
|
||||
|
||||
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
|
||||
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
|
||||
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
|
||||
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
if (arg == "-H" || arg == "--host") {
|
||||
if (++i >= argc) {
|
||||
return false;
|
||||
}
|
||||
params.host = argv[i];
|
||||
} else if (arg == "-p" || arg == "--port") {
|
||||
if (++i >= argc) {
|
||||
return false;
|
||||
}
|
||||
params.port = std::stoi(argv[i]);
|
||||
if (params.port <= 0 || params.port > 65535) {
|
||||
return false;
|
||||
}
|
||||
} else if (arg == "-m" || arg == "--mem") {
|
||||
if (++i >= argc) {
|
||||
return false;
|
||||
}
|
||||
params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static ggml_backend_t create_backend() {
|
||||
ggml_backend_t backend = NULL;
|
||||
#ifdef GGML_USE_CUDA
|
||||
@@ -38,21 +93,25 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
|
||||
#ifdef GGML_USE_CUDA
|
||||
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
|
||||
#else
|
||||
// TODO: implement for other backends
|
||||
*free_mem = 1;
|
||||
*total_mem = 1;
|
||||
#ifdef _WIN32
|
||||
MEMORYSTATUSEX status;
|
||||
status.dwLength = sizeof(status);
|
||||
GlobalMemoryStatusEx(&status);
|
||||
*total_mem = status.ullTotalPhys;
|
||||
*free_mem = status.ullAvailPhys;
|
||||
#else
|
||||
long pages = sysconf(_SC_PHYS_PAGES);
|
||||
long page_size = sysconf(_SC_PAGE_SIZE);
|
||||
*total_mem = pages * page_size;
|
||||
*free_mem = *total_mem;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
const char * host = argv[1];
|
||||
int port = std::stoi(argv[2]);
|
||||
if (port <= 0 || port > 65535) {
|
||||
fprintf(stderr, "Invalid port number: %d\n", port);
|
||||
rpc_server_params params;
|
||||
if (!rpc_server_params_parse(argc, argv, params)) {
|
||||
fprintf(stderr, "Invalid parameters\n");
|
||||
return 1;
|
||||
}
|
||||
ggml_backend_t backend = create_backend();
|
||||
@@ -60,10 +119,15 @@ int main(int argc, char * argv[]) {
|
||||
fprintf(stderr, "Failed to create backend\n");
|
||||
return 1;
|
||||
}
|
||||
printf("Starting RPC server on %s:%d\n", host, port);
|
||||
std::string endpoint = params.host + ":" + std::to_string(params.port);
|
||||
size_t free_mem, total_mem;
|
||||
get_backend_memory(&free_mem, &total_mem);
|
||||
std::string endpoint = std::string(host) + ":" + std::to_string(port);
|
||||
if (params.backend_mem > 0) {
|
||||
free_mem = params.backend_mem;
|
||||
total_mem = params.backend_mem;
|
||||
} else {
|
||||
get_backend_memory(&free_mem, &total_mem);
|
||||
}
|
||||
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
|
||||
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
|
||||
ggml_backend_free(backend);
|
||||
return 0;
|
||||
|
||||
@@ -17,8 +17,9 @@ The project is under active development, and we are [looking for feedback and co
|
||||
|
||||
**Command line options:**
|
||||
|
||||
- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
|
||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
|
||||
- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
|
||||
- `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores).
|
||||
- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`.
|
||||
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
|
||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
||||
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused
|
||||
@@ -36,9 +37,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
|
||||
- `--numa distribute`: Spread execution evenly over all nodes
|
||||
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
|
||||
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system
|
||||
page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
|
||||
|
||||
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
|
||||
- `--numa`: Attempt optimizations that may help on some NUMA systems.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
|
||||
@@ -2387,6 +2387,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
|
||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||
printf(" --rpc SERVERS comma separated list of RPC servers\n");
|
||||
printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
|
||||
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
|
||||
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
|
||||
@@ -2439,6 +2440,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
||||
break;
|
||||
}
|
||||
sparams.port = std::stoi(argv[i]);
|
||||
} else if (arg == "--rpc") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rpc_servers = argv[i];
|
||||
} else if (arg == "--host") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
||||
@@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t
|
||||
|
||||
tensor->buffer = buffer;
|
||||
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
||||
tensor->backend = tensor->view_src->backend;
|
||||
ggml_backend_buffer_init_tensor(buffer, tensor);
|
||||
}
|
||||
|
||||
|
||||
+1
-1
@@ -2558,7 +2558,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
}
|
||||
|
||||
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
|
||||
if (cuda_graph_update_required) {
|
||||
if (use_cuda_graph && cuda_graph_update_required) {
|
||||
cuda_ctx->cuda_graph->number_consecutive_updates++;
|
||||
} else {
|
||||
cuda_ctx->cuda_graph->number_consecutive_updates = 0;
|
||||
|
||||
@@ -315,6 +315,20 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
||||
#endif
|
||||
return c;
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
||||
// __shfl_xor() for half2 was added in ROCm 5.6
|
||||
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
|
||||
typedef union half2_b32 {
|
||||
half2 val;
|
||||
int b32;
|
||||
} half2_b32_t;
|
||||
half2_b32_t tmp;
|
||||
tmp.val = var;
|
||||
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
|
||||
return tmp.val;
|
||||
}
|
||||
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
||||
#endif // defined(GGML_USE_HIPBLAS)
|
||||
|
||||
#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||
@@ -463,6 +477,17 @@ static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -
|
||||
|
||||
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
|
||||
|
||||
static __device__ __forceinline__ float get_alibi_slope(
|
||||
const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
|
||||
) {
|
||||
if (max_bias <= 0.0f) {
|
||||
return 1.0f;
|
||||
}
|
||||
const float base = h < n_head_log2 ? m0 : m1;
|
||||
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||
|
||||
return powf(base, exph);
|
||||
}
|
||||
|
||||
//////////////////////
|
||||
|
||||
|
||||
@@ -1,7 +1,44 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#define FATTN_KQ_STRIDE 256
|
||||
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
|
||||
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
|
||||
|
||||
typedef void (* fattn_kernel_t)(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int nb31,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3);
|
||||
|
||||
template<int D, int parallel_blocks> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(D, 1)
|
||||
@@ -45,3 +82,81 @@ static __global__ void flash_attn_combine_results(
|
||||
|
||||
dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
|
||||
}
|
||||
|
||||
template <int D, int parallel_blocks>
|
||||
void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, int nwarps, int cols_per_block) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
ggml_tensor * KQV = dst;
|
||||
|
||||
GGML_ASSERT(Q->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(K->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(V->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(KQV->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
|
||||
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
|
||||
|
||||
GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
|
||||
|
||||
ggml_cuda_pool & pool = ctx.pool();
|
||||
cudaStream_t main_stream = ctx.stream();
|
||||
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool);
|
||||
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
|
||||
|
||||
if (parallel_blocks > 1) {
|
||||
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
|
||||
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
|
||||
}
|
||||
|
||||
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
||||
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
||||
const int shmem = 0;
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const uint32_t n_head = Q->ne[2];
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
fattn_kernel<<<blocks_num, block_dim, shmem, main_stream>>>(
|
||||
(const char *) Q->data,
|
||||
(const char *) K->data,
|
||||
(const char *) V->data,
|
||||
mask ? ((const char *) mask->data) : nullptr,
|
||||
(parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
||||
scale, max_bias, m0, m1, n_head_log2,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->nb[1], K->nb[2], K->nb[3],
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if ((parallel_blocks) == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
|
||||
const int shmem_combine = 0;
|
||||
|
||||
flash_attn_combine_results<D, parallel_blocks>
|
||||
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
|
||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -0,0 +1,312 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-tile-f16.cuh"
|
||||
|
||||
#define FATTN_KQ_STRIDE_TILE_F16 64
|
||||
|
||||
template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_tile_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int nb31,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
#if FP16_AVAILABLE
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
||||
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) mask + ne11*ic0;
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
|
||||
const half slopeh = __float2half(slopef);
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
|
||||
__shared__ half KQ[ncols*FATTN_KQ_STRIDE_TILE_F16];
|
||||
half2 * KQ2 = (half2 *) KQ;
|
||||
|
||||
__shared__ half2 KV_tmp[FATTN_KQ_STRIDE_TILE_F16][D/2 + 1]; // Pad D to avoid memory bank conflicts.
|
||||
|
||||
half kqmax[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
kqmax[j0/nwarps] = -HALF_MAX_HALF;
|
||||
}
|
||||
half2 kqsum[ncols/nwarps] = {{0.0f, 0.0f}};
|
||||
|
||||
half2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};
|
||||
|
||||
// Convert Q to half2 and store in registers:
|
||||
__shared__ half2 Q_h2[ncols][D/2];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
|
||||
Q_h2[j][i] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F16;
|
||||
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F16) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
half kqmax_new[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/nwarps; ++j) {
|
||||
kqmax_new[j] = kqmax[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
KV_tmp[i_KQ][k_KQ] = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
half2 sum2[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE][ncols/nwarps] = {{{0.0f, 0.0f}}};
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ = 0; k_KQ < D/2; ++k_KQ) {
|
||||
half2 K_k[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE];
|
||||
half2 Q_k[ncols/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
Q_k[j_KQ_0/nwarps] = Q_h2[j_KQ][k_KQ];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE]*Q_k[j_KQ_0/nwarps];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
half sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
||||
|
||||
kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
|
||||
|
||||
KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F16 + i_KQ] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
|
||||
const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]));
|
||||
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F16/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const half2 diff = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] - __half2half2(kqmax[j0/nwarps]);
|
||||
const half2 val = h2exp(diff);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + val;
|
||||
KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] = val;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += nwarps) {
|
||||
const int k = k0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
KV_tmp[k][i] = V_h2[(k_VKQ_0 + k)*stride_KV2 + i];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += 2) {
|
||||
half2 V_k[(D/2)/WARP_SIZE][2];
|
||||
half2 KQ_k[ncols/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
V_k[i0/WARP_SIZE][0] = KV_tmp[k0 + 0][i];
|
||||
V_k[i0/WARP_SIZE][1] = KV_tmp[k0 + 1][i];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
KQ_k[j0/nwarps] = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + k0/2];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][0]* __low2half2(KQ_k[j0/nwarps]);
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][1]*__high2half2(KQ_k[j0/nwarps]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||
|
||||
half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
|
||||
kqsum_j = warp_reduce_sum(kqsum_j);
|
||||
|
||||
#pragma unroll
|
||||
for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
|
||||
const int i0 = i00 + 2*threadIdx.x;
|
||||
|
||||
half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
|
||||
if (parallel_blocks == 1) {
|
||||
dst_val /= __half2half2(kqsum_j);
|
||||
}
|
||||
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
|
||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] = __low2float(dst_val);
|
||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = __high2float(dst_val);
|
||||
}
|
||||
|
||||
if (parallel_blocks != 1 && threadIdx.x == 0) {
|
||||
dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
|
||||
}
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FP16_AVAILABLE
|
||||
}
|
||||
|
||||
template <int cols_per_block, int parallel_blocks>
|
||||
void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
switch (Q->ne[0]) {
|
||||
case 64: {
|
||||
constexpr int D = 64;
|
||||
constexpr int nwarps = 8;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
} break;
|
||||
case 128: {
|
||||
constexpr int D = 128;
|
||||
constexpr int nwarps = 8;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int32_t precision = KQV->op_params[2];
|
||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||
|
||||
if (Q->ne[1] <= 16) {
|
||||
constexpr int cols_per_block = 16;
|
||||
constexpr int parallel_blocks = 4;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 32) {
|
||||
constexpr int cols_per_block = 32;
|
||||
constexpr int parallel_blocks = 4;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 32;
|
||||
constexpr int parallel_blocks = 1;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,309 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-tile-f32.cuh"
|
||||
|
||||
#define FATTN_KQ_STRIDE_TILE_F32 32
|
||||
|
||||
template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_tile_ext_f32(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int nb31,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
||||
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) mask + ne11*ic0;
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
|
||||
__shared__ float KQ[ncols*FATTN_KQ_STRIDE_TILE_F32];
|
||||
|
||||
__shared__ float KV_tmp[FATTN_KQ_STRIDE_TILE_F32][D + 1]; // Pad D to avoid memory bank conflicts.
|
||||
float2 * KV_tmp2 = (float2 *) KV_tmp;
|
||||
|
||||
float kqmax[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
kqmax[j0/nwarps] = -FLT_MAX/2.0f;
|
||||
}
|
||||
float kqsum[ncols/nwarps] = {0.0f};
|
||||
|
||||
float2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};
|
||||
|
||||
// Convert Q to half2 and store in registers:
|
||||
__shared__ float Q_f[ncols][D];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) {
|
||||
float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x];
|
||||
Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale;
|
||||
Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F32;
|
||||
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F32) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
float kqmax_new[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/nwarps; ++j) {
|
||||
kqmax_new[j] = kqmax[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 2*WARP_SIZE) {
|
||||
const half2 tmp = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x];
|
||||
KV_tmp[i_KQ][k_KQ_0 + 0*WARP_SIZE + threadIdx.x] = __low2float(tmp);
|
||||
KV_tmp[i_KQ][k_KQ_0 + 1*WARP_SIZE + threadIdx.x] = __high2float(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
float sum[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE][ncols/nwarps] = {{0.0f}};
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ = 0; k_KQ < D; ++k_KQ) {
|
||||
float K_k[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE];
|
||||
float Q_k[ncols/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
Q_k[j_KQ_0/nwarps] = Q_f[j_KQ][k_KQ];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE] * Q_k[j_KQ_0/nwarps];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
|
||||
|
||||
kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
|
||||
KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F32 + i_KQ] = sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
|
||||
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
|
||||
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
|
||||
|
||||
float kqsum_add = 0.0f;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F32; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float diff = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] - kqmax[j0/nwarps];
|
||||
const float val = expf(diff);
|
||||
kqsum_add += val;
|
||||
KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] = val;
|
||||
}
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F32; k0 += nwarps) {
|
||||
const int k = k0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
KV_tmp2[k*(D/2) + i].x = __low2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]);
|
||||
KV_tmp2[k*(D/2) + i].y = __high2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < FATTN_KQ_STRIDE_TILE_F32; ++k) {
|
||||
float2 V_k[(D/2)/WARP_SIZE];
|
||||
float KQ_k[ncols/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
V_k[i0/WARP_SIZE] = KV_tmp2[k*(D/2) + i];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
KQ_k[j0/nwarps] = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + k];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].x += V_k[i0/WARP_SIZE].x*KQ_k[j0/nwarps];
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].y += V_k[i0/WARP_SIZE].y*KQ_k[j0/nwarps];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||
|
||||
float kqsum_j = kqsum[j_VKQ_0/nwarps];
|
||||
kqsum_j = warp_reduce_sum(kqsum_j);
|
||||
|
||||
#pragma unroll
|
||||
for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
|
||||
const int i0 = i00 + 2*threadIdx.x;
|
||||
|
||||
float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
|
||||
if (parallel_blocks == 1) {
|
||||
dst_val.x /= kqsum_j;
|
||||
dst_val.y /= kqsum_j;
|
||||
}
|
||||
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
|
||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] = dst_val.x;
|
||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = dst_val.y;
|
||||
}
|
||||
|
||||
if (parallel_blocks != 1 && threadIdx.x == 0) {
|
||||
dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int cols_per_block, int parallel_blocks>
|
||||
void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
switch (Q->ne[0]) {
|
||||
case 64: {
|
||||
constexpr int D = 64;
|
||||
constexpr int nwarps = 8;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
} break;
|
||||
case 128: {
|
||||
constexpr int D = 128;
|
||||
constexpr int nwarps = 8;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int32_t precision = KQV->op_params[2];
|
||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||
|
||||
if (Q->ne[1] <= 16) {
|
||||
constexpr int cols_per_block = 16;
|
||||
constexpr int parallel_blocks = 4;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 32) {
|
||||
constexpr int cols_per_block = 32;
|
||||
constexpr int parallel_blocks = 4;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 32;
|
||||
constexpr int parallel_blocks = 1;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
+57
-161
@@ -53,17 +53,8 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
const int stride_KV = nb11 / sizeof(half);
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
half slopeh = __float2half(1.0f);
|
||||
|
||||
// ALiBi
|
||||
if (max_bias > 0.0f) {
|
||||
const int h = blockIdx.y;
|
||||
|
||||
const float base = h < n_head_log2 ? m0 : m1;
|
||||
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||
|
||||
slopeh = __float2half(powf(base, exph));
|
||||
}
|
||||
const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
|
||||
const half slopeh = __float2half(slopef);
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
constexpr int nwarps = D / WARP_SIZE;
|
||||
@@ -232,199 +223,104 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||
}
|
||||
|
||||
if (parallel_blocks != 1 && tid != 0) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
|
||||
}
|
||||
if (parallel_blocks != 1 && tid < ncols) {
|
||||
dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FP16_AVAILABLE
|
||||
}
|
||||
|
||||
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
|
||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
||||
ggml_cuda_pool & pool, cudaStream_t main_stream
|
||||
) {
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool);
|
||||
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
|
||||
|
||||
if (parallel_blocks > 1) {
|
||||
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
|
||||
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
|
||||
}
|
||||
|
||||
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
|
||||
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
||||
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
||||
const int shmem = 0;
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const uint32_t n_head = Q->ne[2];
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
|
||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||
(const char *) Q->data,
|
||||
(const char *) K->data,
|
||||
(const char *) V->data,
|
||||
mask ? ((const char *) mask->data) : nullptr,
|
||||
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
||||
scale, max_bias, m0, m1, n_head_log2,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->nb[1], K->nb[2], K->nb[3],
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (parallel_blocks == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
|
||||
const int shmem_combine = 0;
|
||||
|
||||
flash_attn_combine_results<D, parallel_blocks>
|
||||
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
|
||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
ggml_tensor * KQV = dst;
|
||||
ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int32_t precision = KQV->op_params[2];
|
||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||
|
||||
constexpr int cols_per_block = 1;
|
||||
constexpr int cols_per_block = 1;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 256:
|
||||
launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 64: {
|
||||
constexpr int D = 64;
|
||||
constexpr int nwarps = D/WARP_SIZE;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
} break;
|
||||
case 128: {
|
||||
constexpr int D = 128;
|
||||
constexpr int nwarps = D/WARP_SIZE;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
} break;
|
||||
case 256: {
|
||||
constexpr int D = 256;
|
||||
constexpr int nwarps = D/WARP_SIZE;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template <int cols_per_block, int parallel_blocks>
|
||||
void launch_fattn_vec_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
switch (Q->ne[0]) {
|
||||
case 64: {
|
||||
constexpr int D = 64;
|
||||
constexpr int nwarps = D/WARP_SIZE;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
} break;
|
||||
case 128: {
|
||||
constexpr int D = 128;
|
||||
constexpr int nwarps = D/WARP_SIZE;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int32_t precision = KQV->op_params[2];
|
||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
|
||||
if (Q->ne[1] == 1) {
|
||||
constexpr int cols_per_block = 1;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] == 2) {
|
||||
constexpr int cols_per_block = 2;
|
||||
constexpr int cols_per_block = 2;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 4) {
|
||||
constexpr int cols_per_block = 4;
|
||||
constexpr int cols_per_block = 4;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 8) {
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 1;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
}
|
||||
|
||||
+32
-141
@@ -52,17 +52,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
const int stride_KV = nb11 / sizeof(half);
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
// ALiBi
|
||||
if (max_bias > 0.0f) {
|
||||
const int h = blockIdx.y;
|
||||
|
||||
const float base = h < n_head_log2 ? m0 : m1;
|
||||
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||
|
||||
slope = powf(base, exph);
|
||||
}
|
||||
const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
constexpr int nwarps = D / WARP_SIZE;
|
||||
@@ -221,164 +211,65 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||
}
|
||||
|
||||
if (parallel_blocks != 1 && tid != 0) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
|
||||
}
|
||||
if (parallel_blocks != 1 && tid < ncols) {
|
||||
dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
|
||||
}
|
||||
}
|
||||
|
||||
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f32(
|
||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
||||
ggml_cuda_pool & pool, cudaStream_t main_stream
|
||||
) {
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool);
|
||||
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
|
||||
|
||||
if (parallel_blocks > 1) {
|
||||
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
|
||||
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
|
||||
template <int cols_per_block, int parallel_blocks>
|
||||
void launch_fattn_vec_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
switch (Q->ne[0]) {
|
||||
case 64: {
|
||||
constexpr int D = 64;
|
||||
constexpr int nwarps = D/WARP_SIZE;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
} break;
|
||||
case 128: {
|
||||
constexpr int D = 128;
|
||||
constexpr int nwarps = D/WARP_SIZE;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
} break;
|
||||
}
|
||||
|
||||
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
|
||||
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
||||
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
||||
const int shmem = 0;
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const uint32_t n_head = Q->ne[2];
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>
|
||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||
(const char *) Q->data,
|
||||
(const char *) K->data,
|
||||
(const char *) V->data,
|
||||
mask ? ((const char *) mask->data) : nullptr,
|
||||
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
||||
scale, max_bias, m0, m1, n_head_log2,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->nb[1], K->nb[2], K->nb[3],
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (parallel_blocks == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
|
||||
const int shmem_combine = 0;
|
||||
|
||||
flash_attn_combine_results<D, parallel_blocks>
|
||||
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
|
||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
ggml_tensor * KQV = dst;
|
||||
|
||||
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
|
||||
if (Q->ne[1] == 1) {
|
||||
constexpr int cols_per_block = 1;
|
||||
constexpr int cols_per_block = 1;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] == 2) {
|
||||
constexpr int cols_per_block = 2;
|
||||
constexpr int cols_per_block = 2;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 4) {
|
||||
constexpr int cols_per_block = 4;
|
||||
constexpr int cols_per_block = 4;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 8) {
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 1;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
}
|
||||
|
||||
+70
-131
@@ -1,5 +1,7 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-tile-f16.cuh"
|
||||
#include "fattn-tile-f32.cuh"
|
||||
#include "fattn-vec-f16.cuh"
|
||||
#include "fattn-vec-f32.cuh"
|
||||
#include "fattn.cuh"
|
||||
@@ -83,19 +85,9 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int stride_Q = nb01 / sizeof(float);
|
||||
const int stride_KV = nb11 / sizeof(half);
|
||||
|
||||
half slopeh = __float2half(1.0f);
|
||||
half2 slope2 = make_half2(1.0f, 1.0f);
|
||||
|
||||
// ALiBi
|
||||
if (max_bias > 0.0f) {
|
||||
const int h = blockIdx.y;
|
||||
|
||||
const float base = h < n_head_log2 ? m0 : m1;
|
||||
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||
|
||||
slopeh = __float2half(powf(base, exph));
|
||||
slope2 = make_half2(slopeh, slopeh);
|
||||
}
|
||||
const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
|
||||
const half slopeh = __float2half(slopef);
|
||||
const half2 slope2 = make_half2(slopef, slopef);
|
||||
|
||||
frag_b Q_b[D/16][ncols/frag_n];
|
||||
|
||||
@@ -437,117 +429,64 @@ static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed.");
|
||||
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
|
||||
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
|
||||
|
||||
template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename KQ_acc_t> void launch_fattn_f16_impl(
|
||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
||||
ggml_cuda_pool & pool, cudaStream_t main_stream
|
||||
) {
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool);
|
||||
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
|
||||
template <int D, int cols_per_block, int nwarps, typename KQ_acc_t>
|
||||
void launch_fattn_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
if (parallel_blocks > 1) {
|
||||
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
|
||||
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
|
||||
}
|
||||
|
||||
constexpr int frag_m = (cols_per_block) == 8 && (D) % 32 == 0 ? 32 : 16;
|
||||
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
||||
const dim3 blocks_num(parallel_blocks*(Q->ne[1] + cols_per_block - 1) / cols_per_block, Q->ne[2], Q->ne[3]);
|
||||
const int shmem = 0;
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const uint32_t n_head = Q->ne[2];
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>
|
||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||
(const char *) Q->data,
|
||||
(const char *) K->data,
|
||||
(const char *) V->data,
|
||||
mask ? ((const char *) mask->data) : nullptr,
|
||||
(parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
||||
scale, max_bias, m0, m1, n_head_log2,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->nb[1], K->nb[2], K->nb[3],
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if ((parallel_blocks) == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
|
||||
const int shmem_combine = 0;
|
||||
|
||||
flash_attn_combine_results<D, parallel_blocks>
|
||||
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
|
||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <int D, int cols_per_block, int nwarps, typename KQ_acc_t> void launch_fattn_f16(
|
||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
||||
const int nsm, ggml_cuda_pool & pool, cudaStream_t main_stream
|
||||
) {
|
||||
constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16;
|
||||
const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
|
||||
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
|
||||
|
||||
if (4*blocks_num_pb1 < 2*nsm) {
|
||||
launch_fattn_f16_impl<D, cols_per_block, nwarps, 4, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream);
|
||||
constexpr int parallel_blocks = 4;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
return;
|
||||
}
|
||||
if (2*blocks_num_pb1 < 2*nsm) {
|
||||
launch_fattn_f16_impl<D, cols_per_block, nwarps, 2, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream);
|
||||
constexpr int parallel_blocks = 2;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
return;
|
||||
}
|
||||
launch_fattn_f16_impl<D, cols_per_block, nwarps, 1, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream);
|
||||
constexpr int parallel_blocks = 1;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
ggml_tensor * KQV = dst;
|
||||
|
||||
GGML_ASSERT(Q->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(K->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(V->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(KQV->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
|
||||
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
|
||||
|
||||
GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const int32_t precision = KQV->op_params[2];
|
||||
|
||||
// On AMD the tile kernels perform poorly, use the vec kernel instead:
|
||||
if (cc >= CC_OFFSET_AMD) {
|
||||
if (precision == GGML_PREC_DEFAULT) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst);
|
||||
} else {
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (!fast_fp16_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
if (Q->ne[1] <= 8) {
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
} else {
|
||||
ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (!fp16_mma_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst);
|
||||
if (Q->ne[1] <= 8) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst);
|
||||
} else {
|
||||
ggml_cuda_flash_attn_ext_tile_f16(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -562,22 +501,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
constexpr int nwarps = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_f16< 64, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 64, cols_per_block, nwarps, float>(ctx, dst);
|
||||
break;
|
||||
case 80:
|
||||
launch_fattn_f16< 80, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 80, cols_per_block, nwarps, float>(ctx, dst);
|
||||
break;
|
||||
case 96:
|
||||
launch_fattn_f16< 96, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 96, cols_per_block, nwarps, float>(ctx, dst);
|
||||
break;
|
||||
case 112:
|
||||
launch_fattn_f16<112, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<112, cols_per_block, nwarps, float>(ctx, dst);
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_f16<128, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<128, cols_per_block, nwarps, float>(ctx, dst);
|
||||
break;
|
||||
case 256:
|
||||
launch_fattn_f16<256, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<256, cols_per_block, nwarps, float>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
@@ -588,22 +527,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
constexpr int nwarps = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_f16< 64, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 64, cols_per_block, nwarps, float>(ctx, dst);
|
||||
break;
|
||||
case 80:
|
||||
launch_fattn_f16< 80, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 80, cols_per_block, nwarps, float>(ctx, dst);
|
||||
break;
|
||||
case 96:
|
||||
launch_fattn_f16< 96, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 96, cols_per_block, nwarps, float>(ctx, dst);
|
||||
break;
|
||||
case 112:
|
||||
launch_fattn_f16<112, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<112, cols_per_block, nwarps, float>(ctx, dst);
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_f16<128, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<128, cols_per_block, nwarps, float>(ctx, dst);
|
||||
break;
|
||||
// case 256:
|
||||
// launch_fattn_f16<256, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
// launch_fattn_f16<256, cols_per_block, nwarps, float>(ctx, dst);
|
||||
// break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
@@ -623,16 +562,16 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
constexpr int nwarps = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 96:
|
||||
launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 256:
|
||||
launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
@@ -646,22 +585,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
constexpr int nwarps = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 80:
|
||||
launch_fattn_f16< 80, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 80, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 96:
|
||||
launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 112:
|
||||
launch_fattn_f16<112, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<112, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 256:
|
||||
launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
@@ -674,22 +613,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
constexpr int nwarps = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 80:
|
||||
launch_fattn_f16< 80, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 80, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 96:
|
||||
launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 112:
|
||||
launch_fattn_f16<112, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<112, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
case 256:
|
||||
launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream());
|
||||
launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
|
||||
+2
-11
@@ -1,3 +1,4 @@
|
||||
#include "common.cuh"
|
||||
#include "softmax.cuh"
|
||||
|
||||
template <typename T>
|
||||
@@ -23,17 +24,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, float * dst
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
// ALiBi
|
||||
if (max_bias > 0.0f) {
|
||||
const int h = rowx/nrows_y; // head index
|
||||
|
||||
const float base = h < n_head_log2 ? m0 : m1;
|
||||
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||
|
||||
slope = powf(base, exph);
|
||||
}
|
||||
const float slope = get_alibi_slope(max_bias, rowx/nrows_y, n_head_log2, m0, m1);
|
||||
|
||||
extern __shared__ float data_soft_max_f32[];
|
||||
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
|
||||
|
||||
+33
-30
@@ -1,35 +1,36 @@
|
||||
#include "upscale.cuh"
|
||||
|
||||
static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
|
||||
// blockIdx.z: idx of ne02*ne03
|
||||
// blockIdx.y: idx of ne01*scale_factor, aka ne1
|
||||
// blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
|
||||
// ne00xne01: ne00 * ne01
|
||||
int ne0 = ne00 * scale_factor;
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
static __global__ void upscale_f32(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int ne13,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (index >= ne10 * ne11 * ne12 * ne13) {
|
||||
return;
|
||||
}
|
||||
// operation
|
||||
int i00 = nidx / scale_factor;
|
||||
int i01 = blockIdx.y / scale_factor;
|
||||
int offset_src =
|
||||
i00 +
|
||||
i01 * ne00 +
|
||||
blockIdx.z * ne00xne01;
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
|
||||
int i10 = index % ne10;
|
||||
int i11 = (index / ne10) % ne11;
|
||||
int i12 = (index / (ne10 * ne11)) % ne12;
|
||||
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
|
||||
|
||||
int i00 = i10 / sf0;
|
||||
int i01 = i11 / sf1;
|
||||
int i02 = i12 / sf2;
|
||||
int i03 = i13 / sf3;
|
||||
|
||||
dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
||||
}
|
||||
|
||||
static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
|
||||
const int scale_factor, cudaStream_t stream) {
|
||||
int ne0 = (ne00 * scale_factor);
|
||||
int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
|
||||
dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
|
||||
upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
|
||||
static void upscale_f32_cuda(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int ne13,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3,
|
||||
cudaStream_t stream) {
|
||||
int dst_size = ne10 * ne11 * ne12 * ne13;
|
||||
int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
|
||||
|
||||
upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
@@ -39,10 +40,12 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int scale_factor = dst->op_params[0];
|
||||
const float sf0 = (float)dst->ne[0]/src0->ne[0];
|
||||
const float sf1 = (float)dst->ne[1]/src0->ne[1];
|
||||
const float sf2 = (float)dst->ne[2]/src0->ne[2];
|
||||
const float sf3 = (float)dst->ne[3]/src0->ne[3];
|
||||
|
||||
upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, stream);
|
||||
upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
|
||||
}
|
||||
|
||||
@@ -120,9 +120,16 @@ extern "C" {
|
||||
#ifndef __F16C__
|
||||
#define __F16C__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
||||
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __SSE3__
|
||||
#define __SSE3__
|
||||
#endif
|
||||
#ifndef __SSSE3__
|
||||
#define __SSSE3__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// 16-bit float
|
||||
|
||||
+48
-35
@@ -1378,7 +1378,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
|
||||
|
||||
if (ne00%4 == 0) {
|
||||
while (nth < ne00/4 && nth < 256) {
|
||||
while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
|
||||
nth *= 2;
|
||||
}
|
||||
if (use_f16) {
|
||||
@@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
|
||||
}
|
||||
} else {
|
||||
while (nth < ne00 && nth < 1024) {
|
||||
while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
|
||||
nth *= 2;
|
||||
}
|
||||
if (use_f16) {
|
||||
@@ -2353,7 +2353,10 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
{
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
const int sf = dst->op_params[0];
|
||||
const float sf0 = (float)ne0/src0->ne[0];
|
||||
const float sf1 = (float)ne1/src0->ne[1];
|
||||
const float sf2 = (float)ne2/src0->ne[2];
|
||||
const float sf3 = (float)ne3/src0->ne[3];
|
||||
|
||||
const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
|
||||
|
||||
@@ -2376,7 +2379,10 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
|
||||
[encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18];
|
||||
[encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19];
|
||||
[encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20];
|
||||
[encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21];
|
||||
|
||||
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
|
||||
|
||||
@@ -2512,13 +2518,14 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
GGML_ASSERT(ne11 % 32 == 0);
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
struct ggml_tensor * src3 = gf->nodes[i]->src[3];
|
||||
GGML_ASSERT(ggml_are_same_shape (src1, src2));
|
||||
|
||||
GGML_ASSERT(ggml_are_same_shape(src1, src2));
|
||||
GGML_ASSERT(src3);
|
||||
struct ggml_tensor * src3 = gf->nodes[i]->src[3];
|
||||
|
||||
size_t offs_src3 = 0;
|
||||
|
||||
@@ -2528,6 +2535,11 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
|
||||
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
|
||||
|
||||
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
|
||||
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
|
||||
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
|
||||
const uint64_t nb23 = src2 ? src2->nb[3] : 0;
|
||||
|
||||
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
|
||||
//const int64_t ne31 = src3 ? src3->ne[1] : 0;
|
||||
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
|
||||
@@ -2590,34 +2602,35 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
|
||||
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
|
||||
if (id_src3) {
|
||||
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
|
||||
} else {
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:3];
|
||||
}
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:4];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:7];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:8];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:11];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:12];
|
||||
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:14];
|
||||
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:15];
|
||||
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:16];
|
||||
[encoder setBytes:&nb10 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19];
|
||||
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20];
|
||||
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:21];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:22];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:23];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:24];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:25];
|
||||
[encoder setBytes:&scale length:sizeof( float) atIndex:26];
|
||||
[encoder setBytes:&max_bias length:sizeof( float) atIndex:27];
|
||||
[encoder setBytes:&m0 length:sizeof(m0) atIndex:28];
|
||||
[encoder setBytes:&m1 length:sizeof(m1) atIndex:29];
|
||||
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:30];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19];
|
||||
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22];
|
||||
[encoder setBytes:&scale length:sizeof( float) atIndex:23];
|
||||
[encoder setBytes:&max_bias length:sizeof( float) atIndex:24];
|
||||
[encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
|
||||
[encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
|
||||
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
|
||||
|
||||
if (!use_vec_kernel) {
|
||||
// half8x8 kernel
|
||||
|
||||
+33
-41
@@ -1852,7 +1852,10 @@ kernel void kernel_upscale_f32(
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant int32_t & sf,
|
||||
constant float & sf0,
|
||||
constant float & sf1,
|
||||
constant float & sf2,
|
||||
constant float & sf3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
@@ -1861,15 +1864,17 @@ kernel void kernel_upscale_f32(
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i03 = i3;
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i01 = i1/sf;
|
||||
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
||||
const int64_t i03 = i3/sf3;
|
||||
const int64_t i02 = i2/sf2;
|
||||
const int64_t i01 = i1/sf1;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
dst_ptr[i0] = src0_ptr[i0/sf];
|
||||
const int64_t i00 = i0/sf0;
|
||||
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
dst_ptr[0] = src0_ptr[0];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2049,27 +2054,24 @@ typedef void (flash_attn_ext_f16_t)(
|
||||
device const char * v,
|
||||
device const char * mask,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant uint64_t & nb21,
|
||||
constant uint64_t & nb22,
|
||||
constant uint64_t & nb23,
|
||||
constant uint64_t & nb31,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant float & scale,
|
||||
constant float & max_bias,
|
||||
constant float & m0,
|
||||
@@ -2090,27 +2092,24 @@ kernel void kernel_flash_attn_ext_f16(
|
||||
device const char * v,
|
||||
device const char * mask,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant uint64_t & nb21,
|
||||
constant uint64_t & nb22,
|
||||
constant uint64_t & nb23,
|
||||
constant uint64_t & nb31,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant float & scale,
|
||||
constant float & max_bias,
|
||||
constant float & m0,
|
||||
@@ -2180,10 +2179,6 @@ kernel void kernel_flash_attn_ext_f16(
|
||||
const short ne22 = ne12;
|
||||
const short ne23 = ne13;
|
||||
|
||||
const uint nb21 = nb11;
|
||||
const uint nb22 = nb12;
|
||||
const uint nb23 = nb13;
|
||||
|
||||
// broadcast
|
||||
const short rk2 = ne02/ne12;
|
||||
const short rk3 = ne03/ne13;
|
||||
@@ -2247,11 +2242,16 @@ kernel void kernel_flash_attn_ext_f16(
|
||||
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
|
||||
}
|
||||
|
||||
// mqk = mqk*scale + mask*slope
|
||||
simdgroup_half8x8 mm;
|
||||
simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
|
||||
simdgroup_multiply(mm, mslope, mm);
|
||||
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
|
||||
if (mask != q) {
|
||||
// mqk = mqk*scale + mask*slope
|
||||
simdgroup_half8x8 mm;
|
||||
simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
|
||||
simdgroup_multiply(mm, mslope, mm);
|
||||
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
|
||||
} else {
|
||||
// mqk = mqk*scale
|
||||
simdgroup_multiply(mqk, mscale, mqk);
|
||||
}
|
||||
|
||||
simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
|
||||
}
|
||||
@@ -2425,27 +2425,24 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||
device const char * v,
|
||||
device const char * mask,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant uint64_t & nb21,
|
||||
constant uint64_t & nb22,
|
||||
constant uint64_t & nb23,
|
||||
constant uint64_t & nb31,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant float & scale,
|
||||
constant float & max_bias,
|
||||
constant float & m0,
|
||||
@@ -2521,10 +2518,6 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||
const short ne22 = ne12;
|
||||
const short ne23 = ne13;
|
||||
|
||||
const uint nb21 = nb11;
|
||||
const uint nb22 = nb12;
|
||||
const uint nb23 = nb13;
|
||||
|
||||
// broadcast
|
||||
const short rk2 = ne02/ne12;
|
||||
const short rk3 = ne03/ne13;
|
||||
@@ -2589,8 +2582,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||
|
||||
// mqk = mqk*scale + mask*slope
|
||||
if (tiisg == 0) {
|
||||
float4 mm = (float4) mp4[ic/4 + cc];
|
||||
mqk = mqk*scale + mm*slope;
|
||||
mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f);
|
||||
|
||||
ss4[cc] = mqk;
|
||||
}
|
||||
|
||||
+2214
-40
File diff suppressed because it is too large
Load Diff
+12
-3
@@ -28,7 +28,7 @@
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
#define GGML_DEBUG 1
|
||||
#define GGML_DEBUG 0
|
||||
#if (GGML_DEBUG >= 1)
|
||||
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
|
||||
#else
|
||||
@@ -134,7 +134,13 @@ static bool set_no_delay(sockfd_t sockfd) {
|
||||
int flag = 1;
|
||||
// set TCP_NODELAY to disable Nagle's algorithm
|
||||
int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
|
||||
return ret >= 0;
|
||||
return ret == 0;
|
||||
}
|
||||
|
||||
static bool set_reuse_addr(sockfd_t sockfd) {
|
||||
int flag = 1;
|
||||
int ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int));
|
||||
return ret == 0;
|
||||
}
|
||||
|
||||
static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
|
||||
@@ -181,7 +187,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
|
||||
if (sock == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (!set_reuse_addr(sockfd)) {
|
||||
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
|
||||
return nullptr;
|
||||
}
|
||||
struct sockaddr_in serv_addr;
|
||||
serv_addr.sin_family = AF_INET;
|
||||
serv_addr.sin_addr.s_addr = inet_addr(host);
|
||||
|
||||
@@ -13987,6 +13987,10 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
||||
|
||||
#pragma message("TODO: generalize upscale operator")
|
||||
#pragma message(" https://github.com/ggerganov/ggml/pull/814")
|
||||
GGML_ASSERT(false && "TODO: generalize upscale operator");
|
||||
|
||||
const int scale_factor = dst->op_params[0];
|
||||
|
||||
upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
|
||||
|
||||
+1086
-1071
File diff suppressed because it is too large
Load Diff
+65
-129
@@ -294,7 +294,6 @@ struct vk_op_rope_neox_push_constants {
|
||||
struct vk_op_soft_max_push_constants {
|
||||
uint32_t KX;
|
||||
uint32_t KY;
|
||||
uint32_t KZ;
|
||||
float scale;
|
||||
float max_bias;
|
||||
float m0;
|
||||
@@ -304,7 +303,8 @@ struct vk_op_soft_max_push_constants {
|
||||
|
||||
struct vk_op_argsort_push_constants {
|
||||
uint32_t ncols;
|
||||
bool ascending;
|
||||
uint32_t ncols_pad;
|
||||
int32_t order;
|
||||
};
|
||||
|
||||
// Allow pre-recording command buffers
|
||||
@@ -1501,8 +1501,8 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
||||
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
@@ -3752,7 +3752,7 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
|
||||
}
|
||||
|
||||
|
||||
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
|
||||
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_ADD:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
@@ -3834,7 +3834,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_soft_max_f32;
|
||||
}
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && src2->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_soft_max_f32_f16;
|
||||
}
|
||||
return nullptr;
|
||||
@@ -3900,15 +3900,12 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
||||
}
|
||||
|
||||
template<typename PC>
|
||||
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
|
||||
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) {
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
||||
if (src1 != nullptr) {
|
||||
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
||||
}
|
||||
if (src2 != nullptr) {
|
||||
std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", backend=" << src2->backend << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
|
||||
}
|
||||
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
|
||||
@@ -3929,10 +3926,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
const uint64_t nb2 = dst->nb[2];
|
||||
const uint64_t nb3 = dst->nb[3];
|
||||
|
||||
const bool use_src2 = src2 != nullptr;
|
||||
const uint64_t ne2 = use_src2 ? src2->ne[0] * src2->ne[1] : 0;
|
||||
|
||||
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
|
||||
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op);
|
||||
ggml_vk_func_t op_func;
|
||||
|
||||
if (pipeline == nullptr) {
|
||||
@@ -3955,18 +3949,15 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
|
||||
|
||||
vk_buffer d_X = nullptr;
|
||||
size_t x_buf_offset = 0;
|
||||
vk_buffer d_Y = nullptr;
|
||||
size_t y_buf_offset = 0;
|
||||
vk_buffer d_Z = nullptr;
|
||||
size_t z_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
bool src1_uma = false;
|
||||
bool src2_uma = false;
|
||||
|
||||
if (ctx->device->uma) {
|
||||
ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset);
|
||||
@@ -3975,15 +3966,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset);
|
||||
src1_uma = d_Y != nullptr;
|
||||
}
|
||||
if (use_src2) {
|
||||
ggml_vk_host_get(ctx, src1->data, d_Z, z_buf_offset);
|
||||
src2_uma = d_Z != nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
||||
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||
uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||
uint64_t d_sz = ggml_type_size(dst->type) * ne0;
|
||||
|
||||
vk_buffer d_D = extra->buffer_gpu.lock();
|
||||
@@ -4007,12 +3993,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
GGML_ASSERT(d_Y != nullptr);
|
||||
}
|
||||
|
||||
if (use_src2 && !src2_uma) {
|
||||
d_Z = extra_src2->buffer_gpu.lock();
|
||||
z_buf_offset = extra_src2->offset;
|
||||
GGML_ASSERT(d_Z != nullptr);
|
||||
}
|
||||
|
||||
if (op_supports_incontiguous) {
|
||||
x_sz = ggml_nbytes(src0);
|
||||
y_sz = use_src1 ? ggml_nbytes(src1) : 0;
|
||||
@@ -4046,7 +4026,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
|
||||
break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
|
||||
elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
|
||||
break;
|
||||
case GGML_OP_ARGSORT:
|
||||
elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
|
||||
break;
|
||||
default:
|
||||
elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
|
||||
@@ -4066,7 +4049,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
}
|
||||
|
||||
if (op == GGML_OP_SOFT_MAX) {
|
||||
// Empty src1 and src2 are possible on soft_max, but the shader needs buffers
|
||||
// Empty src1 is possible on soft_max, but the shader needs a buffer
|
||||
vk_subbuffer subbuf_y;
|
||||
if (use_src1) {
|
||||
subbuf_y = { d_Y, y_buf_offset, y_sz };
|
||||
@@ -4074,15 +4057,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
subbuf_y = { d_X, 0, d_X->size };
|
||||
}
|
||||
|
||||
vk_subbuffer subbuf_z;
|
||||
if (use_src2) {
|
||||
subbuf_z = { d_Z, z_buf_offset, z_sz };
|
||||
} else {
|
||||
subbuf_z = { d_X, 0, d_X->size };
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
} else if (use_src1) {
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
@@ -4099,13 +4075,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(op != GGML_OP_SOFT_MAX);
|
||||
GGML_ASSERT(op != GGML_OP_ARGSORT);
|
||||
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
|
||||
|
||||
switch (dst->op) {
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
elements = { (uint32_t)ne01, 1, 1 };
|
||||
break;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
@@ -4145,7 +4121,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
}
|
||||
|
||||
static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
|
||||
}
|
||||
|
||||
static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
@@ -4153,7 +4129,7 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_GET_ROWS, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||
@@ -4168,7 +4144,7 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ADD, {
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ADD, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||
@@ -4183,7 +4159,7 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_MUL, {
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_MUL, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||
@@ -4198,7 +4174,7 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, co
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, {
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_SCALE, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
@@ -4211,7 +4187,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_SQR, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
@@ -4225,7 +4201,7 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, co
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, {
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CLAMP, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
@@ -4240,7 +4216,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
@@ -4252,24 +4228,24 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||
static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
float * op_params = (float *)dst->op_params;
|
||||
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
||||
}
|
||||
|
||||
static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
float * op_params = (float *)dst->op_params;
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
||||
}
|
||||
|
||||
static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
|
||||
}
|
||||
|
||||
static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
int32_t * op_params = (int32_t *)dst->op_params;
|
||||
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
|
||||
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
|
||||
}
|
||||
|
||||
static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
||||
static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
float * op_params = (float *)dst->op_params;
|
||||
|
||||
float scale = op_params[0];
|
||||
@@ -4285,13 +4261,9 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
#pragma message("TODO: src2 is no longer used in soft_max - should be removed and ALiBi calculation should be updated")
|
||||
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
|
||||
|
||||
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, {
|
||||
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_SOFT_MAX, {
|
||||
ncols,
|
||||
src1 != nullptr ? nrows_y : (uint32_t)0,
|
||||
src2 != nullptr ? (uint32_t)1 : (uint32_t)0,
|
||||
scale, max_bias,
|
||||
m0, m1,
|
||||
n_head_log2,
|
||||
@@ -4321,15 +4293,39 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
||||
if (is_neox) {
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float inv_ndims = -1.0f / n_dims;
|
||||
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
|
||||
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, {
|
||||
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
||||
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims
|
||||
});
|
||||
} else {
|
||||
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
|
||||
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, {
|
||||
(uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
|
||||
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
int32_t * op_params = (int32_t *)dst->op_params;
|
||||
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
|
||||
|
||||
uint32_t ncols = src0->ne[0];
|
||||
|
||||
uint32_t ncols_pad = 1;
|
||||
while (ncols_pad < ncols) {
|
||||
ncols_pad *= 2;
|
||||
}
|
||||
|
||||
GGML_ASSERT(ncols_pad <= 1024);
|
||||
|
||||
std::cerr << "ncols=" << ncols << " ncols_pad=" << ncols_pad << " ascending=" << op_params[0] << std::endl;
|
||||
|
||||
std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
|
||||
|
||||
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_ARGSORT, {
|
||||
ncols,
|
||||
ncols_pad,
|
||||
op_params[0],
|
||||
});
|
||||
}
|
||||
|
||||
#ifdef GGML_VULKAN_RUN_TESTS
|
||||
@@ -5432,7 +5428,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
|
||||
const ggml_tensor * src0 = node->src[0];
|
||||
const ggml_tensor * src1 = node->src[1];
|
||||
const ggml_tensor * src2 = node->src[2];
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
|
||||
|
||||
@@ -5547,7 +5542,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
|
||||
break;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, src2, node);
|
||||
ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_ROPE:
|
||||
@@ -6548,7 +6543,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
|
||||
}
|
||||
|
||||
static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) {
|
||||
return;
|
||||
}
|
||||
i0 = std::max(i0, 5);
|
||||
@@ -6569,6 +6564,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
|
||||
val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
||||
} else if (tensor->type == GGML_TYPE_F16) {
|
||||
val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
||||
} else if (tensor->type == GGML_TYPE_I32) {
|
||||
val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@@ -6671,7 +6668,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
ggml_tensor * src0 = tensor->src[0];
|
||||
ggml_tensor * src1 = tensor->src[1];
|
||||
ggml_tensor * src2 = tensor->src[2];
|
||||
|
||||
struct ggml_init_params iparams = {
|
||||
/*.mem_size =*/ 1024*1024*1024,
|
||||
@@ -6798,66 +6794,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
|
||||
}
|
||||
if (src2 != nullptr) {
|
||||
src2_clone = ggml_dup_tensor(ggml_ctx, src2);
|
||||
|
||||
src2_size = ggml_nbytes(src2);
|
||||
|
||||
src2_buffer = malloc(src2_size);
|
||||
src2_clone->data = src2_buffer;
|
||||
if (src2->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
memcpy(src2_clone->data, src2->data, src2_size);
|
||||
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
} else if (src2->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
|
||||
vk_buffer buf = extra->buffer_gpu.lock();
|
||||
uint64_t offset = extra->offset;
|
||||
if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
|
||||
for (int i3 = 0; i3 < src2->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < src2->ne[2]; i2++) {
|
||||
const int idx = i3*src2->ne[2] + i2;
|
||||
ggml_vk_buffer_read(ctx, buf, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
src2_clone->nb[0] = src2->nb[0];
|
||||
src2_clone->nb[1] = src2->nb[1];
|
||||
for (int i = 2; i < GGML_MAX_DIMS; i++) {
|
||||
src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
|
||||
}
|
||||
} else {
|
||||
if (offset + src2_size >= buf->size) {
|
||||
src2_size = buf->size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(ctx, buf, offset, src2_clone->data, src2_size);
|
||||
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||
ggml_vk_print_tensor(ctx, src2, "src2");
|
||||
std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
|
||||
std::cerr << "src2_clone=" << tensor << " src2_clone->backend: " << src2_clone->backend << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
|
||||
if (src2->src[0] != nullptr) {
|
||||
std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " backend=" << src2->src[0]->backend << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
|
||||
}
|
||||
if (src2->src[1] != nullptr) {
|
||||
std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " backend=" << src2->src[1]->backend << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
|
||||
}
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
|
||||
std::cerr << std::endl;
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0);
|
||||
std::cerr << std::endl;
|
||||
std::vector<const ggml_tensor *> done;
|
||||
ggml_vk_print_graph_origin(src2_clone, done);
|
||||
}
|
||||
|
||||
ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src2", src2_clone);
|
||||
}
|
||||
|
||||
if (tensor->op == GGML_OP_MUL_MAT) {
|
||||
tensor_clone = ggml_mul_mat(ggml_ctx, src0_clone, src1_clone);
|
||||
@@ -6877,7 +6813,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
|
||||
} else if (tensor->op == GGML_OP_SOFT_MAX) {
|
||||
if (src1 != nullptr) {
|
||||
tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
|
||||
tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
|
||||
} else {
|
||||
tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
|
||||
}
|
||||
@@ -6964,9 +6900,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
if (src1 != nullptr) {
|
||||
free(src1_buffer);
|
||||
}
|
||||
if (src2 != nullptr) {
|
||||
free(src2_buffer);
|
||||
}
|
||||
|
||||
ggml_free(ggml_ctx);
|
||||
}
|
||||
@@ -7026,8 +6959,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
} else if (tensor->type == GGML_TYPE_F16) {
|
||||
correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
|
||||
result = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
|
||||
} else if (tensor->type == GGML_TYPE_I32) {
|
||||
correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
|
||||
result = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
|
||||
} else {
|
||||
std::cerr << "comp_size=" << comp_size << " but required is " << (i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]) << std::endl;
|
||||
std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl;
|
||||
}
|
||||
} else {
|
||||
std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
|
||||
|
||||
@@ -565,7 +565,8 @@ extern "C" {
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
enum ggml_backend_type backend;
|
||||
|
||||
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
|
||||
|
||||
struct ggml_backend_buffer * buffer;
|
||||
|
||||
@@ -766,7 +767,8 @@ extern "C" {
|
||||
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
||||
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
||||
|
||||
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
|
||||
// use this to compute the memory overhead of a tensor
|
||||
GGML_API size_t ggml_tensor_overhead(void);
|
||||
@@ -1673,12 +1675,24 @@ extern "C" {
|
||||
float p1);
|
||||
|
||||
// nearest interpolate
|
||||
// multiplies ne0 and ne1 by scale factor
|
||||
// used in stable-diffusion
|
||||
GGML_API struct ggml_tensor * ggml_upscale(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int scale_factor);
|
||||
|
||||
// nearest interpolate
|
||||
// nearest interpolate to specified dimensions
|
||||
// used in tortoise.cpp
|
||||
GGML_API struct ggml_tensor * ggml_upscale_ext(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3);
|
||||
|
||||
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
|
||||
GGML_API struct ggml_tensor * ggml_pad(
|
||||
struct ggml_context * ctx,
|
||||
|
||||
+59
-27
@@ -2432,7 +2432,6 @@ layout (push_constant) uniform parameter
|
||||
{
|
||||
uint KX;
|
||||
uint KY;
|
||||
uint KZ;
|
||||
float scale;
|
||||
float max_bias;
|
||||
float m0;
|
||||
@@ -2449,8 +2448,7 @@ layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
|
||||
layout (binding = 2) readonly buffer Z {C_TYPE data_c[];};
|
||||
layout (binding = 3) buffer D {D_TYPE data_d[];};
|
||||
layout (binding = 2) buffer D {D_TYPE data_d[];};
|
||||
|
||||
shared FLOAT_TYPE vals[BLOCK_SIZE];
|
||||
|
||||
@@ -2459,7 +2457,7 @@ void main() {
|
||||
const uint rowx = gl_WorkGroupID.x;
|
||||
const uint rowy = rowx % p.KY;
|
||||
|
||||
float slope = 0.0f;
|
||||
float slope = 1.0f;
|
||||
|
||||
// ALiBi
|
||||
if (p.max_bias > 0.0f) {
|
||||
@@ -2472,11 +2470,18 @@ void main() {
|
||||
}
|
||||
|
||||
// Find max
|
||||
vals[tid] = uintBitsToFloat(0xFF800000);
|
||||
FLOAT_TYPE max_val = uintBitsToFloat(0xFF800000);
|
||||
|
||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||
vals[tid] = max(vals[tid], FLOAT_TYPE(data_a[rowx * p.KX + col]) * p.scale + (p.KY > 0 ? FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) + (p.KZ > 0 ? slope * FLOAT_TYPE(data_c[col]) : 0.0f));
|
||||
[[unroll]] for (uint col0 = 0; col0 < p.KX; col0 += BLOCK_SIZE) {
|
||||
const uint col = col0 + tid;
|
||||
|
||||
if (col >= p.KX) {
|
||||
break;
|
||||
}
|
||||
|
||||
max_val = max(max_val, FLOAT_TYPE(data_a[rowx * p.KX + col]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)));
|
||||
}
|
||||
vals[tid] = max_val;
|
||||
|
||||
barrier();
|
||||
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
@@ -2486,15 +2491,21 @@ void main() {
|
||||
barrier();
|
||||
}
|
||||
|
||||
const FLOAT_TYPE max_val = vals[0];
|
||||
max_val = vals[0];
|
||||
barrier();
|
||||
|
||||
// Sum up values
|
||||
vals[tid] = FLOAT_TYPE(0.0f);
|
||||
|
||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||
[[unroll]] for (uint col0 = 0; col0 < p.KX; col0 += BLOCK_SIZE) {
|
||||
const uint col = col0 + tid;
|
||||
|
||||
if (col >= p.KX) {
|
||||
break;
|
||||
}
|
||||
|
||||
const uint i = rowx * p.KX + col;
|
||||
const FLOAT_TYPE val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) - max_val);
|
||||
const FLOAT_TYPE val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) - max_val);
|
||||
vals[tid] += val;
|
||||
data_d[i] = D_TYPE(val);
|
||||
}
|
||||
@@ -2509,7 +2520,13 @@ void main() {
|
||||
|
||||
const D_TYPE divisor = D_TYPE(vals[0]);
|
||||
|
||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||
[[unroll]] for (uint col0 = 0; col0 < p.KX; col0 += BLOCK_SIZE) {
|
||||
const uint col = col0 + tid;
|
||||
|
||||
if (col >= p.KX) {
|
||||
break;
|
||||
}
|
||||
|
||||
data_d[rowx*p.KX + col] /= divisor;
|
||||
}
|
||||
}
|
||||
@@ -2672,20 +2689,26 @@ argsort_src = """
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
layout(local_size_x = 1024, local_size_y = 1, local_size_z = 1) in;
|
||||
#define BLOCK_SIZE 1024
|
||||
#define ASC 0
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) buffer D {int data_d[];};
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint ncols;
|
||||
bool ascending;
|
||||
uint ncols_pad;
|
||||
uint order;
|
||||
} p;
|
||||
|
||||
shared int dst_row[BLOCK_SIZE];
|
||||
|
||||
void swap(uint idx0, uint idx1) {
|
||||
int tmp = data_d[idx0];
|
||||
data_d[idx0] = data_d[idx1];
|
||||
data_d[idx1] = tmp;
|
||||
int tmp = dst_row[idx0];
|
||||
dst_row[idx0] = dst_row[idx1];
|
||||
dst_row[idx1] = tmp;
|
||||
}
|
||||
|
||||
void main() {
|
||||
@@ -2693,36 +2716,45 @@ void main() {
|
||||
const int col = int(gl_LocalInvocationID.x);
|
||||
const uint row = gl_WorkGroupID.y;
|
||||
|
||||
if (col >= p.ncols) {
|
||||
if (col >= p.ncols_pad) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint a_idx = row * p.ncols;
|
||||
const uint d_idx = row * p.ncols;
|
||||
const uint row_offset = row * p.ncols;
|
||||
|
||||
// initialize indices
|
||||
if (col < p.ncols) {
|
||||
data_d[col] = col;
|
||||
}
|
||||
dst_row[col] = col;
|
||||
barrier();
|
||||
|
||||
for (uint k = 2; k <= p.ncols; k *= 2) {
|
||||
for (uint k = 2; k <= p.ncols_pad; k *= 2) {
|
||||
for (uint j = k / 2; j > 0; j /= 2) {
|
||||
const uint ixj = col ^ j;
|
||||
if (ixj > col) {
|
||||
if ((col & k) == 0) {
|
||||
if (p.ascending ? data_a[a_idx + data_d[d_idx + col]] > data_a[a_idx + data_d[d_idx + ixj]] : data_a[a_idx + data_d[d_idx + col]] < data_a[a_idx + data_d[d_idx + ixj]]) {
|
||||
swap(d_idx + col, d_idx + ixj);
|
||||
if (dst_row[col] >= p.ncols ||
|
||||
(dst_row[ixj] < p.ncols && (p.order == ASC ?
|
||||
data_a[row_offset + dst_row[col]] > data_a[row_offset + dst_row[ixj]] :
|
||||
data_a[row_offset + dst_row[col]] < data_a[row_offset + dst_row[ixj]]))
|
||||
) {
|
||||
swap(col, ixj);
|
||||
}
|
||||
} else {
|
||||
if (p.ascending ? data_a[a_idx + data_d[d_idx + col]] < data_a[a_idx + data_d[d_idx + ixj]] : data_a[a_idx + data_d[d_idx + col]] > data_a[a_idx + data_d[d_idx + ixj]]) {
|
||||
swap(d_idx + col, d_idx + ixj);
|
||||
if (dst_row[ixj] >= p.ncols ||
|
||||
(dst_row[col] < p.ncols && (p.order == ASC ?
|
||||
data_a[row_offset + dst_row[col]] < data_a[row_offset + dst_row[ixj]] :
|
||||
data_a[row_offset + dst_row[col]] > data_a[row_offset + dst_row[ixj]]))
|
||||
) {
|
||||
swap(col, ixj);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
}
|
||||
|
||||
if (col < p.ncols) {
|
||||
data_d[row_offset + col] = dst_row[col];
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
@@ -5188,7 +5188,14 @@ static bool llm_load_tensors(
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||
if (!model.output) {
|
||||
// needs to be on GPU
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
ml.n_created--; // artificial tensor
|
||||
ml.size_data += ggml_nbytes(model.output);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
@@ -6622,6 +6629,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
|
||||
cb(q, "q", il);
|
||||
@@ -6644,8 +6652,8 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * v =
|
||||
ggml_view_3d(ctx, kv.v_l[il],
|
||||
n_embd_head_v, n_kv, n_head_kv,
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
|
||||
0);
|
||||
cb(v, "v", il);
|
||||
|
||||
@@ -6655,7 +6663,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
|
||||
cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
|
||||
cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
|
||||
} else {
|
||||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
cb(kq, "kq", il);
|
||||
@@ -6700,7 +6708,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
||||
cb(kqv_merged, "kqv_merged", il);
|
||||
|
||||
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
|
||||
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
|
||||
cb(cur, "kqv_merged_cont", il);
|
||||
}
|
||||
|
||||
@@ -12575,16 +12583,16 @@ struct llm_tokenizer_wpm {
|
||||
// to lowercase, pad chinese characters, pad punctuation
|
||||
std::string new_str = "";
|
||||
for (uint32_t code : cpts_nfd) {
|
||||
int type = unicode_cpt_type(code);
|
||||
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
|
||||
const codepoint_flags flags = unicode_cpt_flags(code);
|
||||
if (flags.is_accent_mark || flags.is_control) {
|
||||
continue;
|
||||
}
|
||||
code = unicode_tolower(code);
|
||||
if (type == CODEPOINT_TYPE_SEPARATOR) {
|
||||
if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
|
||||
code = ' ';
|
||||
}
|
||||
std::string s = unicode_cpt_to_utf8(code);
|
||||
if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
|
||||
if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
|
||||
new_str += " ";
|
||||
new_str += s;
|
||||
new_str += " ";
|
||||
@@ -12818,6 +12826,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
}
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
||||
LLAMA_LOG_WARN(
|
||||
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
||||
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
||||
"Are you sure this is what you want?\n", __FUNCTION__);
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_eos == 1) {
|
||||
GGML_ASSERT(vocab.special_eos_id != -1);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
@@ -12844,6 +12859,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
}
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
||||
LLAMA_LOG_WARN(
|
||||
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
||||
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
||||
"Are you sure this is what you want?\n", __FUNCTION__);
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_eos == 1) {
|
||||
GGML_ASSERT(vocab.special_add_eos != -1);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
@@ -13904,9 +13926,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
|
||||
|
||||
// Sample the next word X using top-k sampling
|
||||
llama_sample_top_k(nullptr, candidates, int(k), 1);
|
||||
if (ctx) {
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
llama_token X = llama_sample_token(ctx, candidates);
|
||||
t_start_sample_us = ggml_time_us();
|
||||
|
||||
@@ -13920,9 +13940,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
|
||||
// Update mu using the learning rate and error
|
||||
*mu = *mu - eta * e;
|
||||
|
||||
if (ctx) {
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
return X;
|
||||
}
|
||||
|
||||
@@ -17015,13 +17033,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
|
||||
}
|
||||
else {
|
||||
if (cell_range_begin != kv_self.size) {
|
||||
cell_ranges.push_back({ cell_range_begin, i });
|
||||
cell_ranges.emplace_back(cell_range_begin, i);
|
||||
cell_range_begin = kv_self.size;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (cell_range_begin != kv_self.size) {
|
||||
cell_ranges.push_back({ cell_range_begin, kv_self.size });
|
||||
cell_ranges.emplace_back(cell_range_begin, kv_self.size);
|
||||
}
|
||||
|
||||
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
|
||||
|
||||
+176
-90
@@ -1,117 +1,203 @@
|
||||
#!/bin/bash
|
||||
test_suite=${1:-}
|
||||
test_number=${2:-}
|
||||
|
||||
PROG=${0##*/}
|
||||
build_dir="build-ci-debug"
|
||||
|
||||
if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
|
||||
echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
|
||||
echo "Debug specific ctest program."
|
||||
echo
|
||||
echo "Options:"
|
||||
echo " -h, --help Display this help and exit"
|
||||
echo
|
||||
echo "Arguments:"
|
||||
echo " <test_regex> (Mandatory) Supply one regex to the script to filter tests"
|
||||
echo " (test_number) (Optional) Test number to run a specific test"
|
||||
echo
|
||||
echo "Example:"
|
||||
echo " $PROG test-tokenizer"
|
||||
echo " $PROG test-tokenizer 3"
|
||||
echo
|
||||
exit 0
|
||||
fi
|
||||
# Print Color Commands
|
||||
red=$(tput setaf 1)
|
||||
green=$(tput setaf 2)
|
||||
yellow=$(tput setaf 3)
|
||||
blue=$(tput setaf 4)
|
||||
magenta=$(tput setaf 5)
|
||||
cyan=$(tput setaf 6)
|
||||
normal=$(tput sgr0)
|
||||
|
||||
# Function to select and debug a test
|
||||
function select_test() {
|
||||
test_suite=${1:-test}
|
||||
test_number=${2:-}
|
||||
|
||||
# Sanity Check If Tests Is Detected
|
||||
printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
|
||||
tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
|
||||
if [ ${#tests[@]} -eq 0 ]
|
||||
then
|
||||
echo "No tests avaliable... check your compliation process..."
|
||||
echo "Exiting."
|
||||
exit 1
|
||||
fi
|
||||
# Print Help Message
|
||||
####################
|
||||
|
||||
if [ -z $test_number ]
|
||||
then
|
||||
# List out avaliable tests
|
||||
printf "Which test would you like to debug?\n"
|
||||
id=0
|
||||
for s in "${tests[@]}"
|
||||
do
|
||||
echo "Test# ${id}"
|
||||
echo " $s"
|
||||
((id++))
|
||||
done
|
||||
print_full_help() {
|
||||
cat << EOF
|
||||
Usage: $PROG [OPTION]... <test_regex> (test_number)
|
||||
Debug specific ctest program.
|
||||
|
||||
# Prompt user which test they wanted to run
|
||||
printf "\nRun test#? "
|
||||
read test_number
|
||||
else
|
||||
printf "\nUser Already Requested #${test_number}"
|
||||
fi
|
||||
Options:
|
||||
-h, --help display this help and exit
|
||||
-g run in gdb mode
|
||||
|
||||
# Start GDB with the requested test binary and arguments
|
||||
printf "Debugging(GDB) test: ${tests[test_number]}\n"
|
||||
# Change IFS (Internal Field Separator)
|
||||
sIFS=$IFS
|
||||
IFS=$'\n'
|
||||
Arguments:
|
||||
<test_regex> (Mandatory) Supply one regex to the script to filter tests
|
||||
(test_number) (Optional) Test number to run a specific test
|
||||
|
||||
# Get test args
|
||||
gdb_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' ))
|
||||
IFS=$sIFS
|
||||
printf "Debug arguments: ${gdb_args[test_number]}\n\n"
|
||||
|
||||
# Expand paths if needed
|
||||
args=()
|
||||
for x in $(echo ${gdb_args[test_number]} | sed -e 's/"\/\<//' -e 's/\>"//')
|
||||
do
|
||||
args+=($(echo $x | sed -e 's/.*\/..\//..\//'))
|
||||
done
|
||||
|
||||
# Execute debugger
|
||||
echo "gdb args: ${args[@]}"
|
||||
gdb --args ${args[@]}
|
||||
Example:
|
||||
$PROG test-tokenizer
|
||||
$PROG test-tokenizer 3
|
||||
EOF
|
||||
}
|
||||
|
||||
abort() {
|
||||
echo "Error: $1" >&2
|
||||
cat << EOF >&2
|
||||
Usage: $PROG [OPTION]... <test_regex> (test_number)
|
||||
Debug specific ctest program.
|
||||
Refer to --help for full instructions.
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
|
||||
# Dependency Sanity Check
|
||||
#########################
|
||||
|
||||
check_dependency() {
|
||||
command -v "$1" >/dev/null 2>&1 || {
|
||||
abort "$1 is required but not found. Please install it and try again."
|
||||
}
|
||||
}
|
||||
|
||||
check_dependency ctest
|
||||
check_dependency cmake
|
||||
|
||||
|
||||
# Step 0: Check the args
|
||||
if [ -z "$test_suite" ]
|
||||
then
|
||||
echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
|
||||
echo "Supply one regex to the script to filter tests,"
|
||||
echo "and optionally a test number to run a specific test."
|
||||
echo "Use --help flag for full instructions"
|
||||
exit 1
|
||||
########################
|
||||
|
||||
if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
|
||||
print_full_help >&2
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Parse command-line options
|
||||
gdb_mode=false
|
||||
while getopts "g" opt; do
|
||||
case $opt in
|
||||
g)
|
||||
gdb_mode=true
|
||||
echo "gdb_mode Mode Enabled"
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Shift the option parameters
|
||||
shift $((OPTIND - 1))
|
||||
|
||||
# Positionial Argument Processing : <test_regex>
|
||||
if [ -z "${1}" ]; then
|
||||
abort "Test regex is required"
|
||||
else
|
||||
test_suite=${1:-}
|
||||
fi
|
||||
|
||||
# Positionial Argument Processing : (test_number)
|
||||
test_number=${2:-}
|
||||
|
||||
|
||||
# Step 1: Reset and Setup folder context
|
||||
########################################
|
||||
|
||||
## Sanity check that we are actually in a git repo
|
||||
repo_root=$(git rev-parse --show-toplevel)
|
||||
if [ ! -d "$repo_root" ]; then
|
||||
echo "Error: Not in a Git repository."
|
||||
exit 1
|
||||
abort "Not in a Git repository."
|
||||
fi
|
||||
|
||||
## Reset folder to root context of git repo
|
||||
pushd "$repo_root" || exit 1
|
||||
## Reset folder to root context of git repo and Create and enter build directory
|
||||
pushd "$repo_root"
|
||||
rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"
|
||||
|
||||
## Create and enter build directory
|
||||
rm -rf "$build_dir" && mkdir "$build_dir" || exit 1
|
||||
|
||||
# Step 2: Setup Build Environment and Compile Test Binaries
|
||||
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON || exit 1
|
||||
pushd "$build_dir" && make -j || exit 1
|
||||
###########################################################
|
||||
|
||||
# Step 3: Debug the Test
|
||||
select_test "$test_suite" "$test_number"
|
||||
# Note: test-eval-callback requires -DLLAMA_CURL
|
||||
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment"
|
||||
pushd "$build_dir"
|
||||
make -j || abort "Failed to compile"
|
||||
popd > /dev/null || exit 1
|
||||
|
||||
# Step 4: Return to the directory from which the user ran the command.
|
||||
popd || exit 1
|
||||
popd || exit 1
|
||||
popd || exit 1
|
||||
|
||||
# Step 3: Find all tests available that matches REGEX
|
||||
####################################################
|
||||
|
||||
# Ctest Gather Tests
|
||||
# `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
|
||||
# `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
|
||||
# `-V` : Verbose Mode
|
||||
printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
|
||||
pushd "$build_dir"
|
||||
tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
|
||||
if [ ${#tests[@]} -eq 0 ]; then
|
||||
abort "No tests avaliable... check your compliation process..."
|
||||
fi
|
||||
popd > /dev/null || exit 1
|
||||
|
||||
|
||||
# Step 4: Identify Test Command for Debugging
|
||||
#############################################
|
||||
|
||||
# Select test number
|
||||
if [ -z $test_number ]; then
|
||||
# List out avaliable tests
|
||||
printf "Which test would you like to debug?\n"
|
||||
id=0
|
||||
for s in "${tests[@]}"
|
||||
do
|
||||
echo "Test# ${id}"
|
||||
echo " $s"
|
||||
((id++))
|
||||
done
|
||||
|
||||
# Prompt user which test they wanted to run
|
||||
printf "\nRun test#? "
|
||||
read test_number
|
||||
|
||||
else
|
||||
printf "\nUser Already Requested #${test_number}\n"
|
||||
|
||||
fi
|
||||
|
||||
# Grab all tests commands
|
||||
pushd "$build_dir"
|
||||
sIFS=$IFS # Save Initial IFS (Internal Field Separator)
|
||||
IFS=$'\n' # Change IFS (Internal Field Separator) (So we split ctest output by newline rather than by spaces)
|
||||
test_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' )) # Get test args
|
||||
IFS=$sIFS # Reset IFS (Internal Field Separator)
|
||||
popd > /dev/null || exit 1
|
||||
|
||||
# Grab specific test command
|
||||
single_test_name="${tests[test_number]}"
|
||||
single_test_command="${test_args[test_number]}"
|
||||
|
||||
|
||||
# Step 5: Execute or GDB Debug
|
||||
##############################
|
||||
|
||||
printf "${magenta}Running Test #${test_number}: ${single_test_name}${normal}\n"
|
||||
printf "${cyan}single_test_command: ${single_test_command}${normal}\n"
|
||||
|
||||
if [ "$gdb_mode" = "true" ]; then
|
||||
# Execute debugger
|
||||
pushd "$repo_root" || exit 1
|
||||
eval "gdb --args ${single_test_command}"
|
||||
popd > /dev/null || exit 1
|
||||
|
||||
else
|
||||
# Execute Test
|
||||
pushd "$repo_root" || exit 1
|
||||
eval "${single_test_command}"
|
||||
exit_code=$?
|
||||
popd > /dev/null || exit 1
|
||||
|
||||
# Print Result
|
||||
printf "${blue}Ran Test #${test_number}: ${single_test_name}${normal}\n"
|
||||
printf "${yellow}Command: ${single_test_command}${normal}\n"
|
||||
if [ $exit_code -eq 0 ]; then
|
||||
printf "${green}TEST PASS${normal}\n"
|
||||
else
|
||||
printf "${red}TEST FAIL${normal}\n"
|
||||
fi
|
||||
|
||||
fi
|
||||
|
||||
# Return to the directory from which the user ran the command.
|
||||
popd > /dev/null || exit 1
|
||||
|
||||
+118
-48
@@ -1,64 +1,134 @@
|
||||
import regex
|
||||
import ctypes
|
||||
import unicodedata
|
||||
|
||||
|
||||
def get_matches(regex_expr):
|
||||
regex_expr_compiled = regex.compile(regex_expr)
|
||||
unicode_ranges = []
|
||||
current_range = None
|
||||
|
||||
for codepoint in range(0x110000):
|
||||
char = chr(codepoint)
|
||||
if regex_expr_compiled.match(char):
|
||||
if current_range is None:
|
||||
current_range = [codepoint, codepoint]
|
||||
else:
|
||||
current_range[1] = codepoint
|
||||
elif current_range is not None:
|
||||
unicode_ranges.append(tuple(current_range))
|
||||
current_range = None
|
||||
|
||||
if current_range is not None:
|
||||
unicode_ranges.append(tuple(current_range))
|
||||
|
||||
return unicode_ranges
|
||||
class CoodepointFlags (ctypes.Structure):
|
||||
_fields_ = [ # see definition in unicode.h
|
||||
("is_undefined", ctypes.c_uint16, 1),
|
||||
("is_number", ctypes.c_uint16, 1), # regex: \p{N}
|
||||
("is_letter", ctypes.c_uint16, 1), # regex: \p{L}
|
||||
("is_separator", ctypes.c_uint16, 1), # regex: \p{Z}
|
||||
("is_accent_mark", ctypes.c_uint16, 1), # regex: \p{M}
|
||||
("is_punctuation", ctypes.c_uint16, 1), # regex: \p{P}
|
||||
("is_symbol", ctypes.c_uint16, 1), # regex: \p{S}
|
||||
("is_control", ctypes.c_uint16, 1), # regex: \p{C}
|
||||
]
|
||||
|
||||
|
||||
def print_cat(mode, cat, ranges):
|
||||
if mode == "range":
|
||||
print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
|
||||
if mode == "map":
|
||||
print("const std::map<uint32_t, uint32_t> unicode_map_{} = {{".format(cat)) # noqa: NP100
|
||||
for i, values in enumerate(ranges):
|
||||
end = ",\n" if (i % 4 == 3 or i + 1 == len(ranges)) else ", "
|
||||
values = ["0x%08X" % value for value in values]
|
||||
print("{" + ", ".join(values) + "}", end=end) # noqa: NP100
|
||||
print("};") # noqa: NP100
|
||||
print("") # noqa: NP100
|
||||
assert (ctypes.sizeof(CoodepointFlags) == 2)
|
||||
|
||||
|
||||
print_cat("range", "number", get_matches(r'\p{N}'))
|
||||
print_cat("range", "letter", get_matches(r'\p{L}'))
|
||||
print_cat("range", "separator", get_matches(r'\p{Z}'))
|
||||
print_cat("range", "accent_mark", get_matches(r'\p{M}'))
|
||||
print_cat("range", "punctuation", get_matches(r'\p{P}'))
|
||||
print_cat("range", "symbol", get_matches(r'\p{S}'))
|
||||
print_cat("range", "control", get_matches(r'\p{C}'))
|
||||
MAX_CODEPOINTS = 0x110000
|
||||
|
||||
print_cat("range", "whitespace", get_matches(r'\s'))
|
||||
regex_number = regex.compile(r'\p{N}')
|
||||
regex_letter = regex.compile(r'\p{L}')
|
||||
regex_separator = regex.compile(r'\p{Z}')
|
||||
regex_accent_mark = regex.compile(r'\p{M}')
|
||||
regex_punctuation = regex.compile(r'\p{P}')
|
||||
regex_symbol = regex.compile(r'\p{S}')
|
||||
regex_control = regex.compile(r'\p{C}')
|
||||
regex_whitespace = regex.compile(r'\s')
|
||||
|
||||
codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()
|
||||
table_whitespace = []
|
||||
table_lowercase = []
|
||||
table_uppercase = []
|
||||
table_nfd = []
|
||||
|
||||
map_lowercase = []
|
||||
map_uppercase = []
|
||||
for codepoint in range(0x110000):
|
||||
for codepoint in range(MAX_CODEPOINTS):
|
||||
# convert codepoint to unicode character
|
||||
char = chr(codepoint)
|
||||
|
||||
# regex categories
|
||||
flags = codepoint_flags[codepoint]
|
||||
flags.is_number = bool(regex_number.match(char))
|
||||
flags.is_letter = bool(regex_letter.match(char))
|
||||
flags.is_separator = bool(regex_separator.match(char))
|
||||
flags.is_accent_mark = bool(regex_accent_mark.match(char))
|
||||
flags.is_punctuation = bool(regex_punctuation.match(char))
|
||||
flags.is_symbol = bool(regex_symbol.match(char))
|
||||
flags.is_control = bool(regex_control.match(char))
|
||||
flags.is_undefined = bytes(flags)[0] == 0
|
||||
assert (not flags.is_undefined)
|
||||
|
||||
# whitespaces
|
||||
if bool(regex_whitespace.match(char)):
|
||||
table_whitespace.append(codepoint)
|
||||
|
||||
# lowercase conversion
|
||||
lower = ord(char.lower()[0])
|
||||
upper = ord(char.upper()[0])
|
||||
if codepoint != lower:
|
||||
map_lowercase.append((codepoint, lower))
|
||||
table_lowercase.append((codepoint, lower))
|
||||
|
||||
# uppercase conversion
|
||||
upper = ord(char.upper()[0])
|
||||
if codepoint != upper:
|
||||
map_uppercase.append((codepoint, upper))
|
||||
print_cat("map", "lowercase", map_lowercase)
|
||||
print_cat("map", "uppercase", map_uppercase)
|
||||
table_uppercase.append((codepoint, upper))
|
||||
|
||||
# NFD normalization
|
||||
norm = ord(unicodedata.normalize('NFD', char)[0])
|
||||
if codepoint != norm:
|
||||
table_nfd.append((codepoint, norm))
|
||||
|
||||
|
||||
# TODO: generate unicode_map_nfd
|
||||
# group ranges with same flags
|
||||
ranges_flags = [(0, codepoint_flags[0])] # start, flags
|
||||
for codepoint, flags in enumerate(codepoint_flags):
|
||||
if bytes(flags) != bytes(ranges_flags[-1][1]):
|
||||
ranges_flags.append((codepoint, flags))
|
||||
ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
|
||||
|
||||
|
||||
# group ranges with same nfd
|
||||
ranges_nfd = [(0, 0, 0)] # start, last, nfd
|
||||
for codepoint, norm in table_nfd:
|
||||
start = ranges_nfd[-1][0]
|
||||
if ranges_nfd[-1] != (start, codepoint - 1, norm):
|
||||
ranges_nfd.append(None)
|
||||
start = codepoint
|
||||
ranges_nfd[-1] = (start, codepoint, norm)
|
||||
|
||||
|
||||
# Generate 'unicode-data.cpp'
|
||||
|
||||
|
||||
def out(line=""):
|
||||
print(line, end='\n') # noqa
|
||||
|
||||
|
||||
out("""\
|
||||
// generated with scripts/gen-unicode-data.py
|
||||
|
||||
#include "unicode-data.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
""")
|
||||
|
||||
out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
|
||||
for codepoint, flags in ranges_flags:
|
||||
flags = int.from_bytes(bytes(flags), "little")
|
||||
out("{0x%06X, 0x%04X}," % (codepoint, flags))
|
||||
out("};\n")
|
||||
|
||||
out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
|
||||
out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
|
||||
out("};\n")
|
||||
|
||||
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
|
||||
for tuple in table_lowercase:
|
||||
out("{0x%06X, 0x%06X}," % tuple)
|
||||
out("};\n")
|
||||
|
||||
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
|
||||
for tuple in table_uppercase:
|
||||
out("{0x%06X, 0x%06X}," % tuple)
|
||||
out("};\n")
|
||||
|
||||
out("const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd")
|
||||
for triple in ranges_nfd:
|
||||
out("{0x%06X, 0x%06X, 0x%06X}," % triple)
|
||||
out("};\n")
|
||||
|
||||
@@ -112,6 +112,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
# src/ggml-opencl.h -> ggml-opencl.h
|
||||
# src/ggml-quants.c -> ggml-quants.c
|
||||
# src/ggml-quants.h -> ggml-quants.h
|
||||
# src/ggml-rpc.cpp -> ggml-rpc.cpp
|
||||
# src/ggml-rpc.h -> ggml-rpc.h
|
||||
# src/ggml-sycl.cpp -> ggml-sycl.cpp
|
||||
# src/ggml-sycl.h -> ggml-sycl.h
|
||||
# src/ggml-vulkan.cpp -> ggml-vulkan.cpp
|
||||
@@ -149,6 +151,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
-e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
|
||||
-e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
|
||||
-e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
|
||||
-e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \
|
||||
-e 's/src\/ggml-rpc\.h/ggml-rpc.h/g' \
|
||||
-e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
|
||||
-e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
|
||||
-e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
|
||||
|
||||
@@ -1 +1 @@
|
||||
30f54cbb3ada3e4c5bc6924de3e5918e5be4ff11
|
||||
126d34985705a5a2222723c145cb4e125ac689f3
|
||||
|
||||
@@ -20,6 +20,8 @@ cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
|
||||
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
|
||||
cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
|
||||
cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h
|
||||
cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml-rpc.cpp
|
||||
cp -rpv ../ggml/src/ggml-rpc.h ./ggml-rpc.h
|
||||
cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml-sycl.cpp
|
||||
cp -rpv ../ggml/src/ggml-sycl.h ./ggml-sycl.h
|
||||
cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml-vulkan.cpp
|
||||
|
||||
+62
-13
@@ -16,6 +16,7 @@
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
|
||||
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
|
||||
// static RNG initialization (revisit if n_threads stops being constant)
|
||||
static const size_t n_threads = std::thread::hardware_concurrency();
|
||||
@@ -49,6 +50,22 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
||||
t.join();
|
||||
}
|
||||
|
||||
#if 0
|
||||
const char * val_str = getenv("GGML_TEST_EPS");
|
||||
float val = 1e-9f;
|
||||
if (val_str != nullptr) {
|
||||
val = std::stof(val_str);
|
||||
printf("GGML_TEST_EPS=%e\n", val);
|
||||
}
|
||||
|
||||
// test quantization with very small values that may result in nan scales due to division by zero
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
data[i] = val;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
||||
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
|
||||
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
|
||||
@@ -64,6 +81,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
||||
}
|
||||
}
|
||||
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
|
||||
GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
|
||||
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
||||
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
||||
// This is going to create some weird integers though.
|
||||
@@ -1329,23 +1347,47 @@ struct test_upscale : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const int32_t scale_factor;
|
||||
const bool transpose;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type, ne, scale_factor);
|
||||
return VARS_TO_STR4(type, ne, scale_factor, transpose);
|
||||
}
|
||||
|
||||
test_upscale(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {512, 512, 3, 1},
|
||||
int32_t scale_factor = 2)
|
||||
: type(type), ne(ne), scale_factor(scale_factor) {}
|
||||
int32_t scale_factor = 2, bool transpose = false)
|
||||
: type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
if (transpose) a = ggml_transpose(ctx, a);
|
||||
ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_UPSCALE (ext)
|
||||
struct test_upscale_ext : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const std::array<int64_t, 4> ne_tgt;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type, ne, ne_tgt);
|
||||
}
|
||||
|
||||
test_upscale_ext(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {2, 5, 7, 11},
|
||||
std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
|
||||
: type(type), ne(ne), ne_tgt(ne_tgt) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_GROUP_NORM
|
||||
struct test_group_norm : public test_case {
|
||||
const ggml_type type;
|
||||
@@ -1487,25 +1529,27 @@ struct test_flash_attn_ext : public test_case {
|
||||
const int64_t kv; // kv size
|
||||
const int64_t nb; // batch size
|
||||
|
||||
const bool mask; // use mask
|
||||
|
||||
const float max_bias; // ALiBi
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR5(hs, nh, kv, nb, max_bias);
|
||||
return VARS_TO_STR6(hs, nh, kv, nb, mask, max_bias);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 5e-4;
|
||||
}
|
||||
|
||||
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, float max_bias = 0.0f)
|
||||
: hs(hs), nh(nh), kv(kv), nb(nb), max_bias(max_bias) {}
|
||||
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f)
|
||||
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
|
||||
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
||||
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
||||
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
|
||||
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs), max_bias);
|
||||
ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
|
||||
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
@@ -2167,6 +2211,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
|
||||
test_cases.emplace_back(new test_sum_rows());
|
||||
test_cases.emplace_back(new test_upscale());
|
||||
test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
|
||||
test_cases.emplace_back(new test_upscale_ext());
|
||||
test_cases.emplace_back(new test_group_norm());
|
||||
test_cases.emplace_back(new test_acc());
|
||||
test_cases.emplace_back(new test_pad());
|
||||
@@ -2175,11 +2221,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
test_cases.emplace_back(new test_leaky_relu());
|
||||
|
||||
for (int hs : { 64, 80, 128, 256, }) {
|
||||
for (float max_bias : {0.0f, 8.0f}) {
|
||||
for (int nh : { 32, }) {
|
||||
for (int kv : { 512, 1024, }) {
|
||||
for (int nb : { 1, 2, 4, 8, }) {
|
||||
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, max_bias));
|
||||
for (bool mask : { true, false } ) {
|
||||
for (float max_bias : { 0.0f, 8.0f }) {
|
||||
if (!mask && max_bias > 0.0f) continue;
|
||||
for (int nh : { 32, }) {
|
||||
for (int kv : { 512, 1024, }) {
|
||||
for (int nb : { 1, 2, 4, 8, }) {
|
||||
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Test libllama tokenizer == AutoTokenizer.
|
||||
# Brute force random tokens/text generation.
|
||||
# Brute force random words/text generation.
|
||||
#
|
||||
# Sample usage:
|
||||
#
|
||||
@@ -12,10 +12,10 @@ import argparse
|
||||
import subprocess
|
||||
import random
|
||||
|
||||
from typing import Iterator
|
||||
from typing import Callable, Iterator
|
||||
|
||||
import cffi
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
logger = logging.getLogger("test-tokenizer-random-bpe")
|
||||
|
||||
@@ -145,28 +145,35 @@ def generator_custom_text() -> Iterator[str]:
|
||||
def generator_custom_text_edge_cases() -> Iterator[str]:
|
||||
"""Edge cases found while debugging"""
|
||||
yield from [
|
||||
'\x1f-a', # unicode_ranges_control, {0x00001C, 0x00001F}
|
||||
'¼-a', # unicode_ranges_digit, 0x00BC
|
||||
'½-a', # unicode_ranges_digit, 0x00BD
|
||||
'¾-a', # unicode_ranges_digit, 0x00BE
|
||||
'a 〇b', # unicode_ranges_digit, 0x3007
|
||||
'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
|
||||
'\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
|
||||
'<s>a' # TODO: Phi-3 fail
|
||||
'\x1f-a', # unicode_ranges_control, {0x00001C, 0x00001F}
|
||||
'¼-a', # unicode_ranges_digit, 0x00BC
|
||||
'½-a', # unicode_ranges_digit, 0x00BD
|
||||
'¾-a', # unicode_ranges_digit, 0x00BE
|
||||
'a 〇b', # unicode_ranges_digit, 0x3007
|
||||
'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
|
||||
'\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
|
||||
'Cửa Việt', # llama-3, ignore_merges = true
|
||||
'<s>a', # TODO: Phi-3 fail
|
||||
'a\na', # TODO: Bert fail
|
||||
]
|
||||
|
||||
|
||||
def generator_random_chars(iterations = 100) -> Iterator[str]:
|
||||
def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
|
||||
"""Brute force check all vocab words"""
|
||||
yield from vocab
|
||||
|
||||
|
||||
def generator_random_chars(iterations=100) -> Iterator[str]:
|
||||
"""Brute force random text with simple characters"""
|
||||
|
||||
WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
|
||||
CHARS = list(set("""
|
||||
CHARS = list(sorted(set("""
|
||||
ABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||
abcdefghijklmnopqrstuvwxyz
|
||||
ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
|
||||
áéíóúàèìòùâêîôûäëïöü
|
||||
.-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
|
||||
"""))
|
||||
""")))
|
||||
|
||||
rand = random.Random()
|
||||
for m in range(iterations):
|
||||
@@ -181,13 +188,13 @@ def generator_random_chars(iterations = 100) -> Iterator[str]:
|
||||
yield "".join(text)
|
||||
|
||||
|
||||
def generator_random_vocab_chars(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
|
||||
def generator_random_vocab_chars(vocab: list[str], iterations=100) -> Iterator[str]:
|
||||
"""Brute force random text with vocab characters"""
|
||||
|
||||
vocab_ids = list(tokenizer.vocab.values())
|
||||
vocab_text = tokenizer.decode(vocab_ids, skip_special_tokens=True)
|
||||
vocab_chars = list(set(vocab_text))
|
||||
del vocab_ids, vocab_text
|
||||
vocab_chars = set()
|
||||
for word in vocab:
|
||||
vocab_chars.update(word)
|
||||
vocab_chars = list(sorted(vocab_chars))
|
||||
|
||||
rand = random.Random()
|
||||
for m in range(iterations):
|
||||
@@ -196,19 +203,11 @@ def generator_random_vocab_chars(tokenizer: PreTrainedTokenizerBase, iterations
|
||||
yield "".join(text)
|
||||
|
||||
|
||||
def generator_random_vocab_tokens(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
|
||||
"""Brute force random text from vocab tokens"""
|
||||
def generator_random_vocab_words(vocab: list[str], iterations=100) -> Iterator[str]:
|
||||
"""Brute force random text from vocab words"""
|
||||
|
||||
space_id = tokenizer.encode(" ", add_special_tokens=False)[0]
|
||||
vocab_ids = list(tokenizer.vocab.values())
|
||||
vocab_ids = list(sorted(vocab_ids + vocab_ids))
|
||||
for i in range(1, len(vocab_ids), 2):
|
||||
vocab_ids[i] = space_id
|
||||
vocab_tokens = tokenizer.decode(vocab_ids, skip_special_tokens=True)
|
||||
vocab_tokens = vocab_tokens.split(" ")
|
||||
del vocab_ids
|
||||
|
||||
yield from vocab_tokens
|
||||
vocab = [w.strip() for w in vocab]
|
||||
yield from vocab
|
||||
|
||||
rand = random.Random()
|
||||
for m in range(iterations):
|
||||
@@ -217,14 +216,13 @@ def generator_random_vocab_tokens(tokenizer: PreTrainedTokenizerBase, iterations
|
||||
num_words = rand.randint(300, 400)
|
||||
for i in range(num_words):
|
||||
k = rand.randint(1, 3)
|
||||
tokens = rand.choices(vocab_tokens, k=k)
|
||||
tokens = [t.strip(" \n\r\t") for t in tokens]
|
||||
words = rand.choices(vocab, k=k)
|
||||
sep = rand.choice(" \n\r\t")
|
||||
text.append("".join(tokens) + sep)
|
||||
text.append("".join(words) + sep)
|
||||
yield "".join(text)
|
||||
|
||||
|
||||
def generator_random_bytes(iterations = 100) -> Iterator[str]:
|
||||
def generator_random_bytes(iterations=100) -> Iterator[str]:
|
||||
"""Brute force random bytes"""
|
||||
|
||||
WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
|
||||
@@ -242,10 +240,10 @@ def generator_random_bytes(iterations = 100) -> Iterator[str]:
|
||||
yield "".join(text)
|
||||
|
||||
|
||||
def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, generator: Iterator[str]):
|
||||
def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, generator: Iterator[str]):
|
||||
|
||||
def find_first_mismatch(ids1: list[int], ids2: list[int]):
|
||||
for i, (a,b) in enumerate(zip(ids1, ids2)):
|
||||
for i, (a, b) in enumerate(zip(ids1, ids2)):
|
||||
if a != b:
|
||||
return i
|
||||
if len(ids1) == len(ids2):
|
||||
@@ -255,15 +253,12 @@ def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerB
|
||||
t0 = time.perf_counter()
|
||||
logger.info("%s: %s" % (generator.__name__, "ini"))
|
||||
for text in generator:
|
||||
ids1 = model.tokenize(text, add_special=False, parse_special=False)
|
||||
ids2 = tokenizer.encode(text, add_special_tokens=False)
|
||||
ids1 = func_tokenize1(text)
|
||||
ids2 = func_tokenize2(text)
|
||||
if ids1 != ids2:
|
||||
i = find_first_mismatch(ids1, ids2)
|
||||
ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
|
||||
ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
|
||||
text2 = tokenizer.decode(ids2, skip_special_tokens=True)
|
||||
assert (text2 in text)
|
||||
logger.info(" Text: " + repr(text2))
|
||||
logger.info(" TokenIDs: " + str(ids1))
|
||||
logger.info(" Expected: " + str(ids2))
|
||||
raise Exception()
|
||||
@@ -271,25 +266,37 @@ def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerB
|
||||
logger.info("%s: end, time: %.3f secs" % (generator.__name__, t1 - t0))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
def main(argv: list[str] = None):
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
|
||||
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
args = parser.parse_args()
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048))
|
||||
|
||||
model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
|
||||
|
||||
test_compare_tokenizer(model, tokenizer, generator_custom_text())
|
||||
test_compare_tokenizer(model, tokenizer, generator_custom_text_edge_cases())
|
||||
test_compare_tokenizer(model, tokenizer, generator_random_chars(10_000))
|
||||
test_compare_tokenizer(model, tokenizer, generator_random_vocab_chars(tokenizer, 10_000))
|
||||
test_compare_tokenizer(model, tokenizer, generator_random_vocab_tokens(tokenizer, 10_000))
|
||||
# test_compare_tokenizer(model, tokenizer, generator_random_bytes(10_000)) # FAIL
|
||||
def func_tokenize2(text: str):
|
||||
return tokenizer.encode(text, add_special_tokens=False)
|
||||
|
||||
parse_special = all(len(func_tokenize2(t)) == 1 for t in tokenizer.all_special_tokens)
|
||||
|
||||
def func_tokenize1(text: str):
|
||||
return model.tokenize(text, add_special=False, parse_special=parse_special)
|
||||
|
||||
vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 10_000))
|
||||
# test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL
|
||||
|
||||
model.free()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
+6969
-2169
File diff suppressed because it is too large
Load Diff
+15
-12
@@ -1,17 +1,20 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
|
||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
|
||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
|
||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
|
||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
|
||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
|
||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
|
||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
|
||||
extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
|
||||
extern const std::map<char32_t, char32_t> unicode_map_lowercase;
|
||||
struct range_nfd {
|
||||
uint32_t first;
|
||||
uint32_t last;
|
||||
uint32_t nfd;
|
||||
};
|
||||
|
||||
static const uint32_t MAX_CODEPOINTS = 0x110000;
|
||||
|
||||
extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
|
||||
extern const std::unordered_set<uint32_t> unicode_set_whitespace;
|
||||
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
|
||||
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
|
||||
extern const std::vector<range_nfd> unicode_ranges_nfd;
|
||||
|
||||
+89
-111
@@ -1,4 +1,4 @@
|
||||
#include "unicode.h"
|
||||
#include "unicode.h"
|
||||
#include "unicode-data.h"
|
||||
|
||||
#include <cassert>
|
||||
@@ -109,57 +109,49 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
|
||||
// return result;
|
||||
//}
|
||||
|
||||
static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
|
||||
std::unordered_map<uint32_t, int> cpt_types;
|
||||
for (auto p : unicode_ranges_number) {
|
||||
for (auto i = p.first; i <= p.second; ++i) {
|
||||
cpt_types[i] = CODEPOINT_TYPE_NUMBER;
|
||||
static std::vector<codepoint_flags> unicode_cpt_flags_array() {
|
||||
std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
|
||||
|
||||
assert (unicode_ranges_flags.front().first == 0);
|
||||
assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
|
||||
for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
|
||||
const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags
|
||||
const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags
|
||||
for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
|
||||
cpt_flags[cpt] = range_ini.second;
|
||||
}
|
||||
}
|
||||
for (auto p : unicode_ranges_letter) {
|
||||
for (auto i = p.first; i <= p.second; ++i) {
|
||||
cpt_types[i] = CODEPOINT_TYPE_LETTER;
|
||||
}
|
||||
|
||||
for (auto cpt : unicode_set_whitespace) {
|
||||
cpt_flags[cpt].is_whitespace = true;
|
||||
}
|
||||
for (auto p : unicode_ranges_separator) {
|
||||
for (auto i = p.first; i <= p.second; ++i) {
|
||||
cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
|
||||
}
|
||||
|
||||
for (auto p : unicode_map_lowercase) {
|
||||
cpt_flags[p.second].is_lowercase = true;
|
||||
}
|
||||
for (auto p : unicode_ranges_accent_mark) {
|
||||
for (auto i = p.first; i <= p.second; ++i) {
|
||||
cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
|
||||
}
|
||||
|
||||
for (auto p : unicode_map_uppercase) {
|
||||
cpt_flags[p.second].is_uppercase = true;
|
||||
}
|
||||
for (auto p : unicode_ranges_punctuation) {
|
||||
for (auto i = p.first; i <= p.second; ++i) {
|
||||
cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
|
||||
}
|
||||
|
||||
for (auto &range : unicode_ranges_nfd) { // start, last, nfd
|
||||
cpt_flags[range.nfd].is_nfd = true;
|
||||
}
|
||||
for (auto p : unicode_ranges_symbol) {
|
||||
for (auto i = p.first; i <= p.second; ++i) {
|
||||
cpt_types[i] = CODEPOINT_TYPE_SYMBOL;
|
||||
}
|
||||
}
|
||||
for (auto p : unicode_ranges_control) {
|
||||
for (auto i = p.first; i <= p.second; ++i) {
|
||||
cpt_types[i] = CODEPOINT_TYPE_CONTROL;
|
||||
}
|
||||
}
|
||||
return cpt_types;
|
||||
|
||||
return cpt_flags;
|
||||
}
|
||||
|
||||
static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
|
||||
std::unordered_map<uint8_t, std::string> map;
|
||||
for (int ch = u'!'; ch <= u'~'; ++ch) {
|
||||
for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~'
|
||||
assert(0 <= ch && ch < 256);
|
||||
map[ch] = unicode_cpt_to_utf8(ch);
|
||||
}
|
||||
for (int ch = u'¡'; ch <= u'¬'; ++ch) {
|
||||
for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬'
|
||||
assert(0 <= ch && ch < 256);
|
||||
map[ch] = unicode_cpt_to_utf8(ch);
|
||||
}
|
||||
for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
|
||||
for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ'
|
||||
assert(0 <= ch && ch < 256);
|
||||
map[ch] = unicode_cpt_to_utf8(ch);
|
||||
}
|
||||
@@ -175,15 +167,15 @@ static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
|
||||
|
||||
static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
|
||||
std::unordered_map<std::string, uint8_t> map;
|
||||
for (int ch = u'!'; ch <= u'~'; ++ch) {
|
||||
for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~'
|
||||
assert(0 <= ch && ch < 256);
|
||||
map[unicode_cpt_to_utf8(ch)] = ch;
|
||||
}
|
||||
for (int ch = u'¡'; ch <= u'¬'; ++ch) {
|
||||
for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬'
|
||||
assert(0 <= ch && ch < 256);
|
||||
map[unicode_cpt_to_utf8(ch)] = ch;
|
||||
}
|
||||
for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
|
||||
for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ'
|
||||
assert(0 <= ch && ch < 256);
|
||||
map[unicode_cpt_to_utf8(ch)] = ch;
|
||||
}
|
||||
@@ -238,8 +230,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
||||
};
|
||||
|
||||
auto _get_cpt_type = [&] (const size_t pos) -> int {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
|
||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
@@ -261,7 +254,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||
|
||||
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
||||
const char32_t cpt = _get_cpt(pos);
|
||||
const int cpt_type = _get_cpt_type(pos);
|
||||
const auto flags = _get_flags(pos);
|
||||
|
||||
// regex: 's|'t|'re|'ve|'m|'ll|'d
|
||||
if (cpt == '\'' && pos+1 < offset_end) {
|
||||
@@ -281,39 +274,37 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||
}
|
||||
}
|
||||
|
||||
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
|
||||
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
||||
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
|
||||
// regex: <space>?\p{L}+
|
||||
if (cpt2_type == CODEPOINT_TYPE_LETTER) {
|
||||
if (flags2.is_letter) {
|
||||
pos += (cpt == ' ');
|
||||
while (cpt2_type == CODEPOINT_TYPE_LETTER) {
|
||||
cpt2_type = _get_cpt_type(++pos);
|
||||
while (flags2.is_letter) {
|
||||
flags2 = _get_flags(++pos);
|
||||
}
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
// regex: <space>?\p{N}+
|
||||
if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
|
||||
if (flags2.is_number) {
|
||||
pos += (cpt == ' ');
|
||||
while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
|
||||
cpt2_type = _get_cpt_type(++pos);
|
||||
while (flags2.is_number) {
|
||||
flags2 = _get_flags(++pos);
|
||||
}
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
// regex: <space>?[^\s\p{L}\p{N}]+
|
||||
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
||||
pos += (cpt == ' ');
|
||||
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||
cpt2_type = _get_cpt_type(++pos);
|
||||
cpt2 = _get_cpt(pos);
|
||||
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
||||
flags2 = _get_flags(++pos);
|
||||
}
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t num_whitespaces = 0;
|
||||
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
|
||||
while (_get_flags(pos+num_whitespaces).is_whitespace) {
|
||||
num_whitespaces++;
|
||||
}
|
||||
|
||||
@@ -357,8 +348,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
||||
};
|
||||
|
||||
auto _get_cpt_type = [&] (const size_t pos) -> int {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
|
||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
@@ -380,7 +372,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
|
||||
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
||||
const char32_t cpt = _get_cpt(pos);
|
||||
const int cpt_type = _get_cpt_type(pos);
|
||||
const auto flags = _get_flags(pos);
|
||||
|
||||
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
|
||||
if (cpt == '\'' && pos+1 < offset_end) {
|
||||
@@ -401,10 +393,10 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
}
|
||||
|
||||
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
|
||||
if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
|
||||
if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters
|
||||
if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
|
||||
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
|
||||
pos++;
|
||||
while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
|
||||
while (_get_flags(pos).is_letter) {
|
||||
pos++;
|
||||
}
|
||||
_add_token(pos);
|
||||
@@ -413,9 +405,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
}
|
||||
|
||||
// regex: \p{N}{1,3}
|
||||
if (cpt_type == CODEPOINT_TYPE_NUMBER) {
|
||||
if (flags.is_number) {
|
||||
size_t ini = pos;
|
||||
while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
|
||||
while (_get_flags(pos).is_number) {
|
||||
if (++pos - ini >= 3 ) {
|
||||
_add_token(pos);
|
||||
ini = pos;
|
||||
@@ -426,14 +418,13 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
}
|
||||
|
||||
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
||||
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
|
||||
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
||||
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
|
||||
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
||||
pos += (cpt == ' ');
|
||||
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||
cpt2_type = _get_cpt_type(++pos);
|
||||
cpt2 = _get_cpt(pos);
|
||||
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
||||
flags2 = _get_flags(++pos);
|
||||
}
|
||||
char32_t cpt2 = _get_cpt(pos);
|
||||
while (cpt2 == '\r' || cpt2 == '\n') {
|
||||
cpt2 = _get_cpt(++pos);
|
||||
}
|
||||
@@ -443,7 +434,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
|
||||
size_t num_whitespaces = 0;
|
||||
size_t last_end_r_or_n = 0;
|
||||
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
|
||||
while (_get_flags(pos+num_whitespaces).is_whitespace) {
|
||||
char32_t cpt2 = _get_cpt(pos+num_whitespaces);
|
||||
if (cpt2 == '\r' || cpt2 == '\n') {
|
||||
last_end_r_or_n = pos + num_whitespaces + 1;
|
||||
@@ -589,15 +580,14 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
|
||||
}
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
|
||||
std::vector<uint32_t> result;
|
||||
result.reserve(cpts.size());
|
||||
auto comp = [] (const uint32_t cpt, const range_nfd & range) {
|
||||
return cpt < range.first;
|
||||
};
|
||||
std::vector<uint32_t> result(cpts.size());
|
||||
for (size_t i = 0; i < cpts.size(); ++i) {
|
||||
auto it = unicode_map_nfd.find(cpts[i]);
|
||||
if (it == unicode_map_nfd.end()) {
|
||||
result.push_back(cpts[i]);
|
||||
} else {
|
||||
result.push_back(it->second);
|
||||
}
|
||||
const uint32_t cpt = cpts[i];
|
||||
auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
|
||||
result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
@@ -611,31 +601,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
||||
return result;
|
||||
}
|
||||
|
||||
int unicode_cpt_type(uint32_t cp) {
|
||||
static std::unordered_map<uint32_t, int> cpt_types = unicode_cpt_type_map();
|
||||
const auto it = cpt_types.find(cp);
|
||||
return it == cpt_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second;
|
||||
codepoint_flags unicode_cpt_flags(const uint32_t cp) {
|
||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
||||
static const auto cpt_flags = unicode_cpt_flags_array();
|
||||
return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
|
||||
}
|
||||
|
||||
int unicode_cpt_type(const std::string & utf8) {
|
||||
if (utf8.length() == 0) {
|
||||
return CODEPOINT_TYPE_UNIDENTIFIED;
|
||||
codepoint_flags unicode_cpt_flags(const std::string & utf8) {
|
||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
||||
if (utf8.empty()) {
|
||||
return undef; // undefined
|
||||
}
|
||||
size_t offset = 0;
|
||||
return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
|
||||
}
|
||||
|
||||
bool unicode_cpt_is_whitespace(uint32_t cp) {
|
||||
static const std::unordered_set<uint32_t> is_whitespace = [] {
|
||||
std::unordered_set<uint32_t> is_whitespace;
|
||||
for (auto p : unicode_ranges_whitespace) {
|
||||
for (auto i = p.first; i <= p.second; ++i) {
|
||||
is_whitespace.insert(i);
|
||||
}
|
||||
}
|
||||
return is_whitespace;
|
||||
}();
|
||||
return (bool)is_whitespace.count(cp);
|
||||
return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
|
||||
}
|
||||
|
||||
std::string unicode_byte_to_utf8(uint8_t byte) {
|
||||
@@ -656,21 +634,21 @@ char32_t unicode_tolower(char32_t cp) {
|
||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
||||
// unicode categories
|
||||
static const std::map<std::string, int> k_ucat_enum = {
|
||||
{ "\\p{N}", CODEPOINT_TYPE_NUMBER },
|
||||
{ "\\p{L}", CODEPOINT_TYPE_LETTER },
|
||||
{ "\\p{P}", CODEPOINT_TYPE_PUNCTUATION },
|
||||
{ "\\p{N}", codepoint_flags::NUMBER },
|
||||
{ "\\p{L}", codepoint_flags::LETTER },
|
||||
{ "\\p{P}", codepoint_flags::PUNCTUATION },
|
||||
};
|
||||
|
||||
static const std::map<int, int> k_ucat_cpt = {
|
||||
{ CODEPOINT_TYPE_NUMBER, 0xD1 },
|
||||
{ CODEPOINT_TYPE_LETTER, 0xD2 },
|
||||
{ CODEPOINT_TYPE_PUNCTUATION, 0xD3 },
|
||||
{ codepoint_flags::NUMBER, 0xD1 },
|
||||
{ codepoint_flags::LETTER, 0xD2 },
|
||||
{ codepoint_flags::PUNCTUATION, 0xD3 },
|
||||
};
|
||||
|
||||
static const std::map<int, std::string> k_ucat_map = {
|
||||
{ CODEPOINT_TYPE_NUMBER, "\x30-\x39" }, // 0-9
|
||||
{ CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
|
||||
{ CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
|
||||
{ codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9
|
||||
{ codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
|
||||
{ codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
|
||||
};
|
||||
|
||||
// compute collapsed codepoints only if needed by at least one regex
|
||||
@@ -701,10 +679,10 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
continue;
|
||||
}
|
||||
|
||||
const int cpt_type = unicode_cpt_type(cpts[i]);
|
||||
const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag();
|
||||
|
||||
if (k_ucat_cpt.find(cpt_type) != k_ucat_cpt.end()) {
|
||||
text_collapsed[i] = k_ucat_cpt.at(cpt_type);
|
||||
if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) {
|
||||
text_collapsed[i] = k_ucat_cpt.at(cpt_flag);
|
||||
} else {
|
||||
text_collapsed[i] = (char) 0xD0; // fallback
|
||||
}
|
||||
|
||||
@@ -4,24 +4,56 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define CODEPOINT_TYPE_UNIDENTIFIED 0
|
||||
#define CODEPOINT_TYPE_NUMBER 1
|
||||
#define CODEPOINT_TYPE_LETTER 2
|
||||
#define CODEPOINT_TYPE_SEPARATOR 3
|
||||
#define CODEPOINT_TYPE_ACCENT_MARK 4
|
||||
#define CODEPOINT_TYPE_PUNCTUATION 5
|
||||
#define CODEPOINT_TYPE_SYMBOL 6
|
||||
#define CODEPOINT_TYPE_CONTROL 7
|
||||
struct codepoint_flags {
|
||||
enum {
|
||||
UNDEFINED = 0x0001,
|
||||
NUMBER = 0x0002, // regex: \p{N}
|
||||
LETTER = 0x0004, // regex: \p{L}
|
||||
SEPARATOR = 0x0008, // regex: \p{Z}
|
||||
ACCENT_MARK = 0x0010, // regex: \p{M}
|
||||
PUNCTUATION = 0x0020, // regex: \p{P}
|
||||
SYMBOL = 0x0040, // regex: \p{S}
|
||||
CONTROL = 0x0080, // regex: \p{C}
|
||||
MASK_CATEGORIES = 0x00FF,
|
||||
};
|
||||
|
||||
// codepoint type
|
||||
uint16_t is_undefined : 1;
|
||||
uint16_t is_number : 1; // regex: \p{N}
|
||||
uint16_t is_letter : 1; // regex: \p{L}
|
||||
uint16_t is_separator : 1; // regex: \p{Z}
|
||||
uint16_t is_accent_mark : 1; // regex: \p{M}
|
||||
uint16_t is_punctuation : 1; // regex: \p{P}
|
||||
uint16_t is_symbol : 1; // regex: \p{S}
|
||||
uint16_t is_control : 1; // regex: \p{C}
|
||||
// helper flags
|
||||
uint16_t is_whitespace : 1; // regex: \s
|
||||
uint16_t is_lowercase : 1;
|
||||
uint16_t is_uppercase : 1;
|
||||
uint16_t is_nfd : 1;
|
||||
|
||||
// decode from uint16
|
||||
inline codepoint_flags(const uint16_t flags=0) {
|
||||
*reinterpret_cast<uint16_t*>(this) = flags;
|
||||
}
|
||||
|
||||
inline uint16_t as_uint() const {
|
||||
return *reinterpret_cast<const uint16_t*>(this);
|
||||
}
|
||||
|
||||
inline uint16_t category_flag() const {
|
||||
return this->as_uint() & MASK_CATEGORIES;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
std::string unicode_cpt_to_utf8(uint32_t cp);
|
||||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
||||
|
||||
int unicode_cpt_type(uint32_t cp);
|
||||
int unicode_cpt_type(const std::string & utf8);
|
||||
|
||||
bool unicode_cpt_is_whitespace(uint32_t cp);
|
||||
codepoint_flags unicode_cpt_flags(const uint32_t cp);
|
||||
codepoint_flags unicode_cpt_flags(const std::string & utf8);
|
||||
|
||||
std::string unicode_byte_to_utf8(uint8_t byte);
|
||||
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
||||
|
||||
Reference in New Issue
Block a user