forked from wylab/llama.cpp
Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a4e8912dfd | |||
| edbf42edfd | |||
| d643bb2c79 | |||
| 8e186ef0e7 | |||
| 5fbfe384d4 | |||
| c76532e7ba | |||
| 2aa777d86d | |||
| eb0f5c28d3 | |||
| cf4cb59e64 | |||
| 0d5c742161 | |||
| 42158ae2e8 | |||
| 797f2ac062 |
+142
-125
@@ -1,4 +1,4 @@
|
||||
name: Create Release
|
||||
name: Release
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
@@ -227,6 +227,66 @@ jobs:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
|
||||
name: llama-bin-ubuntu-vulkan-x64.zip
|
||||
|
||||
windows-cpu:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- arch: 'x64'
|
||||
- arch: 'arm64'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-cpu-${{ matrix.arch }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install Ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
with:
|
||||
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cmake -S . -B build -G "Ninja Multi-Config" `
|
||||
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake `
|
||||
-DGGML_NATIVE=OFF `
|
||||
-DGGML_BACKEND_DL=ON `
|
||||
-DGGML_CPU_ALL_VARIANTS=ON `
|
||||
-DGGML_OPENMP=OFF `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
|
||||
7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-bin-win-cpu-${{ matrix.arch }}.zip
|
||||
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
|
||||
|
||||
windows:
|
||||
runs-on: windows-latest
|
||||
|
||||
@@ -237,52 +297,30 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'cpu-x64'
|
||||
- backend: 'vulkan'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
|
||||
#- build: 'openblas-x64'
|
||||
# arch: 'x64'
|
||||
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'vulkan-x64'
|
||||
arch: 'x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
- build: 'cpu-arm64'
|
||||
arch: 'arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
|
||||
- build: 'opencl-adreno-arm64'
|
||||
defines: '-DGGML_VULKAN=ON'
|
||||
target: 'ggml-vulkan'
|
||||
- backend: 'opencl-adreno'
|
||||
arch: 'arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
|
||||
target: 'ggml-opencl'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-${{ matrix.build }}
|
||||
key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
mkdir $env:RUNNER_TEMP/openblas
|
||||
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
|
||||
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
||||
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
||||
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
|
||||
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'vulkan-x64' }}
|
||||
if: ${{ matrix.backend == 'vulkan' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
@@ -296,7 +334,7 @@ jobs:
|
||||
|
||||
- name: Install OpenCL Headers and Libs
|
||||
id: install_opencl
|
||||
if: ${{ matrix.build == 'opencl-adreno-arm64' }}
|
||||
if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }}
|
||||
run: |
|
||||
git clone https://github.com/KhronosGroup/OpenCL-Headers
|
||||
cd OpenCL-Headers
|
||||
@@ -314,46 +352,22 @@ jobs:
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build-arm64-release --target install --config release
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
with:
|
||||
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cmake -S . -B build ${{ matrix.defines }} `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
|
||||
cmake --build build --config Release --target ${{ matrix.target }}
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
|
||||
name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
|
||||
|
||||
windows-cuda:
|
||||
runs-on: windows-2019
|
||||
@@ -366,8 +380,6 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
@@ -386,45 +398,30 @@ jobs:
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
-DGGML_CPU_ALL_VARIANTS=ON ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_CPU=OFF ^
|
||||
-DGGML_CUDA=ON ^
|
||||
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
|
||||
${{ env.CMAKE_ARGS }}
|
||||
-DLLAMA_CURL=OFF
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
run: |
|
||||
@@ -432,13 +429,13 @@ jobs:
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*
|
||||
7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-sycl:
|
||||
runs-on: windows-latest
|
||||
@@ -455,8 +452,6 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
@@ -469,15 +464,18 @@ jobs:
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
# TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
shell: cmd
|
||||
run: |
|
||||
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
cmake -G "Ninja" -B build ^
|
||||
-DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
-DLLAMA_CURL=OFF
|
||||
cmake --build build --target ggml-sycl -j
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
@@ -502,12 +500,12 @@ jobs:
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
7z a llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
path: llama-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
windows-hip:
|
||||
@@ -521,8 +519,6 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Clone rocWMMA repository
|
||||
id: clone_rocwmma
|
||||
@@ -532,7 +528,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-hip-release
|
||||
key: windows-latest-cmake-hip-${{ matrix.gpu_target }}-x64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
@@ -550,14 +546,8 @@ jobs:
|
||||
run: |
|
||||
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||
@@ -569,31 +559,23 @@ jobs:
|
||||
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_HIP=ON `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
-DLLAMA_CURL=OFF
|
||||
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
|
||||
md "build\bin\rocblas\library\"
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
|
||||
7z a llama-bin-win-hip-${{ matrix.gpu_target }}-x64.zip .\build\bin\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
path: llama-bin-win-hip-${{ matrix.gpu_target }}-x64.zip
|
||||
name: llama-bin-win-hip-${{ matrix.gpu_target }}-x64.zip
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
@@ -655,14 +637,16 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
needs:
|
||||
- ubuntu-22-cpu
|
||||
- ubuntu-22-vulkan
|
||||
- windows
|
||||
- windows-cpu
|
||||
- windows-cuda
|
||||
- windows-sycl
|
||||
- windows-hip
|
||||
- ubuntu-22-cpu
|
||||
- ubuntu-22-vulkan
|
||||
- macOS-arm64
|
||||
- macOS-x64
|
||||
- ios-xcode-build
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -680,10 +664,43 @@ jobs:
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ./artifact
|
||||
merge-multiple: true
|
||||
|
||||
- name: Move artifacts
|
||||
id: move_artifacts
|
||||
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
|
||||
run: |
|
||||
mkdir -p release
|
||||
|
||||
echo "Adding CPU backend files to existing zips..."
|
||||
for arch in x64 arm64; do
|
||||
cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip"
|
||||
temp_dir=$(mktemp -d)
|
||||
echo "Extracting CPU backend for $arch..."
|
||||
unzip "$cpu_zip" -d "$temp_dir"
|
||||
|
||||
echo "Adding CPU files to $arch zips..."
|
||||
for target_zip in artifact/llama-bin-win-*-${arch}.zip; do
|
||||
if [[ "$target_zip" == "$cpu_zip" ]]; then
|
||||
continue
|
||||
fi
|
||||
echo "Adding CPU backend to $(basename "$target_zip")"
|
||||
realpath_target_zip=$(realpath "$target_zip")
|
||||
(cd "$temp_dir" && zip -r "$realpath_target_zip" .)
|
||||
done
|
||||
|
||||
rm -rf "$temp_dir"
|
||||
done
|
||||
|
||||
echo "Renaming and moving zips to release..."
|
||||
for zip_file in artifact/llama-bin-win-*.zip; do
|
||||
base_name=$(basename "$zip_file" .zip)
|
||||
zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip"
|
||||
echo "Moving $zip_file to release/$zip_name"
|
||||
mv "$zip_file" "release/$zip_name"
|
||||
done
|
||||
|
||||
echo "Moving other artifacts..."
|
||||
mv -v artifact/*.zip release
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
@@ -702,7 +719,7 @@ jobs:
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const release_id = '${{ steps.create_release.outputs.id }}';
|
||||
for (let file of await fs.readdirSync('./artifact/release')) {
|
||||
for (let file of await fs.readdirSync('./release')) {
|
||||
if (path.extname(file) === '.zip') {
|
||||
console.log('uploadReleaseAsset', file);
|
||||
await github.repos.uploadReleaseAsset({
|
||||
@@ -710,7 +727,7 @@ jobs:
|
||||
repo: context.repo.repo,
|
||||
release_id: release_id,
|
||||
name: file,
|
||||
data: await fs.readFileSync(`./artifact/release/${file}`)
|
||||
data: await fs.readFileSync(`./release/${file}`)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
+1
-1
@@ -1678,7 +1678,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.warmup = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
|
||||
add_opt(common_arg(
|
||||
{"--spm-infill"},
|
||||
string_format(
|
||||
|
||||
@@ -2645,7 +2645,7 @@ class Qwen2Model(TextModel):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
|
||||
@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
|
||||
class Qwen2VLModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN2VL
|
||||
|
||||
@@ -2669,7 +2669,7 @@ class Qwen2VLModel(TextModel):
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
|
||||
@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
|
||||
class Qwen2VLVisionModel(VisionModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
|
||||
}
|
||||
}
|
||||
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
|
||||
static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
// run model
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
if (llama_decode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to decode\n", __func__);
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
}
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
|
||||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch) {
|
||||
float * out = emb + p * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd);
|
||||
batch_encode(ctx, batch, out, s, n_embd);
|
||||
common_batch_clear(batch);
|
||||
p += s;
|
||||
s = 0;
|
||||
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// final batch
|
||||
float * out = emb + p * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd);
|
||||
batch_encode(ctx, batch, out, s, n_embd);
|
||||
|
||||
// save embeddings to chunks
|
||||
for (int i = 0; i < n_chunks; i++) {
|
||||
@@ -267,7 +267,7 @@ int main(int argc, char ** argv) {
|
||||
batch_add_seq(query_batch, query_tokens, 0);
|
||||
|
||||
std::vector<float> query_emb(n_embd, 0);
|
||||
batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
|
||||
batch_encode(ctx, query_batch, query_emb.data(), 1, n_embd);
|
||||
|
||||
common_batch_clear(query_batch);
|
||||
|
||||
|
||||
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
|
||||
auto generate = [&](const std::string & prompt) {
|
||||
std::string response;
|
||||
|
||||
const bool is_first = llama_kv_self_used_cells(ctx) == 0;
|
||||
const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0;
|
||||
|
||||
// tokenize the prompt
|
||||
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
|
||||
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
|
||||
while (true) {
|
||||
// check if we have enough space in the context to evaluate this batch
|
||||
int n_ctx = llama_n_ctx(ctx);
|
||||
int n_ctx_used = llama_kv_self_used_cells(ctx);
|
||||
int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
|
||||
if (n_ctx_used + batch.n_tokens > n_ctx) {
|
||||
printf("\033[0m\n");
|
||||
fprintf(stderr, "context size exceeded\n");
|
||||
|
||||
+12
-1
@@ -528,14 +528,15 @@ extern "C" {
|
||||
GGML_UNARY_OP_STEP,
|
||||
GGML_UNARY_OP_TANH,
|
||||
GGML_UNARY_OP_ELU,
|
||||
GGML_UNARY_OP_RELU,
|
||||
GGML_UNARY_OP_SIGMOID,
|
||||
GGML_UNARY_OP_GELU,
|
||||
GGML_UNARY_OP_GELU_ERF,
|
||||
GGML_UNARY_OP_GELU_QUICK,
|
||||
GGML_UNARY_OP_SILU,
|
||||
GGML_UNARY_OP_HARDSWISH,
|
||||
GGML_UNARY_OP_HARDSIGMOID,
|
||||
GGML_UNARY_OP_EXP,
|
||||
GGML_UNARY_OP_RELU,
|
||||
|
||||
GGML_UNARY_OP_COUNT,
|
||||
};
|
||||
@@ -1024,6 +1025,16 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// GELU using erf (error function) when possible
|
||||
// some backends may fallback to approximation based on Abramowitz and Stegun formula
|
||||
GGML_API struct ggml_tensor * ggml_gelu_erf(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_gelu_erf_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_gelu_quick(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
@@ -2202,6 +2202,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
} break;
|
||||
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
{
|
||||
|
||||
@@ -2691,6 +2691,109 @@ static void ggml_compute_forward_gelu(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_gelu_erf
|
||||
|
||||
static void ggml_compute_forward_gelu_erf_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
assert(ggml_is_contiguous_1(src0));
|
||||
assert(ggml_is_contiguous_1(dst));
|
||||
assert(ggml_are_same_shape(src0, dst));
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nc = src0->ne[0];
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
ggml_vec_gelu_erf_f32(nc,
|
||||
(float *) ((char *) dst->data + i1*( dst->nb[1])),
|
||||
(float *) ((char *) src0->data + i1*(src0->nb[1])));
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int k = 0; k < nc; k++) {
|
||||
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
||||
GGML_UNUSED(x);
|
||||
assert(!isnan(x));
|
||||
assert(!isinf(x));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_gelu_erf_f16(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
assert(ggml_is_contiguous_1(src0));
|
||||
assert(ggml_is_contiguous_1(dst));
|
||||
assert(ggml_are_same_shape(src0, dst));
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nc = src0->ne[0];
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
ggml_vec_gelu_erf_f16(nc,
|
||||
(ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
|
||||
(ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int k = 0; k < nc; k++) {
|
||||
const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
||||
const float v = GGML_FP16_TO_FP32(x);
|
||||
GGML_UNUSED(v);
|
||||
assert(!isnan(v));
|
||||
assert(!isinf(v));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_gelu_erf(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_gelu_erf_f32(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_gelu_erf_f16(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_gelu_quick
|
||||
|
||||
static void ggml_compute_forward_gelu_quick_f32(
|
||||
@@ -7749,6 +7852,10 @@ void ggml_compute_forward_unary(
|
||||
{
|
||||
ggml_compute_forward_gelu(params, dst);
|
||||
} break;
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
{
|
||||
ggml_compute_forward_gelu_erf(params, dst);
|
||||
} break;
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
{
|
||||
ggml_compute_forward_gelu_quick(params, dst);
|
||||
|
||||
@@ -428,6 +428,7 @@ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp
|
||||
static const float GELU_COEF_A = 0.044715f;
|
||||
static const float GELU_QUICK_COEF = -1.702f;
|
||||
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
static const float SQRT_2_INV = 0.70710678118654752440084436210484f;
|
||||
|
||||
inline static float ggml_gelu_f32(float x) {
|
||||
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
@@ -440,6 +441,14 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
|
||||
}
|
||||
}
|
||||
|
||||
inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
float xi = GGML_FP16_TO_FP32(x[i]);
|
||||
float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
|
||||
y[i] = GGML_FP32_TO_FP16(res);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_GELU_FP16
|
||||
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
||||
uint16_t t;
|
||||
@@ -463,6 +472,13 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
||||
}
|
||||
#endif
|
||||
|
||||
inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
float xi = x[i];
|
||||
y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
|
||||
}
|
||||
}
|
||||
|
||||
inline static float ggml_gelu_quick_f32(float x) {
|
||||
return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
|
||||
}
|
||||
|
||||
@@ -149,6 +149,8 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_SIGMOID,
|
||||
GGML_METAL_KERNEL_TYPE_GELU,
|
||||
GGML_METAL_KERNEL_TYPE_GELU_4,
|
||||
GGML_METAL_KERNEL_TYPE_GELU_ERF,
|
||||
GGML_METAL_KERNEL_TYPE_GELU_ERF_4,
|
||||
GGML_METAL_KERNEL_TYPE_GELU_QUICK,
|
||||
GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,
|
||||
GGML_METAL_KERNEL_TYPE_SILU,
|
||||
@@ -1103,6 +1105,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID, sigmoid, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF, gelu_erf, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF_4, gelu_erf_4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
|
||||
@@ -1613,6 +1617,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
@@ -2251,6 +2256,25 @@ static bool ggml_metal_encode_node(
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
{
|
||||
int64_t n = ggml_nelements(dst);
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
if (n % 4 == 0) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF_4].pipeline;
|
||||
n /= 4;
|
||||
} else {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF].pipeline;
|
||||
}
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
{
|
||||
int64_t n = ggml_nelements(dst);
|
||||
|
||||
@@ -856,6 +856,7 @@ kernel void kernel_tanh(
|
||||
constant float GELU_COEF_A = 0.044715f;
|
||||
constant float GELU_QUICK_COEF = -1.702f;
|
||||
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
constant float SQRT_2_INV = 0.70710678118654752440084436210484f;
|
||||
|
||||
kernel void kernel_gelu(
|
||||
device const float * src0,
|
||||
@@ -897,6 +898,42 @@ kernel void kernel_gelu_quick_4(
|
||||
dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
|
||||
}
|
||||
|
||||
// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
|
||||
// ref: https://www.johndcook.com/blog/python_erf/
|
||||
constant float p_erf = 0.3275911f;
|
||||
constant float a1_erf = 0.254829592f;
|
||||
constant float a2_erf = -0.284496736f;
|
||||
constant float a3_erf = 1.421413741f;
|
||||
constant float a4_erf = -1.453152027f;
|
||||
constant float a5_erf = 1.061405429f;
|
||||
|
||||
template<typename T>
|
||||
T erf_approx(T x) {
|
||||
T sign_x = sign(x);
|
||||
x = fabs(x);
|
||||
T t = 1.0f / (1.0f + p_erf * x);
|
||||
T y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
|
||||
return sign_x * y;
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_erf(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
device const float & x = src0[tpig];
|
||||
|
||||
dst[tpig] = 0.5f*x*(1.0f+erf_approx<float>(x*SQRT_2_INV));
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_erf_4(
|
||||
device const float4 * src0,
|
||||
device float4 * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
device const float4 & x = src0[tpig];
|
||||
|
||||
dst[tpig] = 0.5f*x*(1.0f+erf_approx<float4>(x*SQRT_2_INV));
|
||||
}
|
||||
|
||||
kernel void kernel_silu(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include <cmath>
|
||||
#include <memory>
|
||||
#include <charconv>
|
||||
#include <mutex>
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
@@ -74,6 +75,7 @@ struct ggml_cl_version {
|
||||
cl_uint minor = 0;
|
||||
};
|
||||
|
||||
|
||||
struct ggml_cl_compiler_version {
|
||||
ADRENO_CL_COMPILER_TYPE type;
|
||||
int major = -1;
|
||||
@@ -91,6 +93,14 @@ struct ggml_cl_compiler_version {
|
||||
}
|
||||
};
|
||||
|
||||
static size_t align_to(size_t value, size_t to_alignment) {
|
||||
GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
|
||||
GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
|
||||
|
||||
return ((value + to_alignment - 1) / to_alignment) * to_alignment;
|
||||
}
|
||||
|
||||
|
||||
// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
|
||||
static ggml_cl_version parse_cl_version(std::string_view str) {
|
||||
size_t major_str_begin = 0;
|
||||
@@ -221,13 +231,25 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
|
||||
return { type, major, minor, patch };
|
||||
}
|
||||
|
||||
struct ggml_backend_opencl_context;
|
||||
|
||||
// backend device context
|
||||
struct ggml_backend_opencl_device_context {
|
||||
cl_platform_id platform;
|
||||
std::string platform_name;
|
||||
|
||||
cl_device_id device;
|
||||
std::string device_name;
|
||||
cl_device_id device;
|
||||
std::string device_name;
|
||||
cl_device_type device_type;
|
||||
std::string device_version;
|
||||
|
||||
// Initialized by ggml_cl2_init().
|
||||
ggml_backend_opencl_context * backend_ctx = nullptr;
|
||||
|
||||
// Initialized by ggml_backend_opencl_device_get_buffer_type()
|
||||
ggml_backend_buffer_type buffer_type;
|
||||
|
||||
cl_context context = nullptr;
|
||||
};
|
||||
|
||||
// backend context
|
||||
@@ -248,6 +270,8 @@ struct ggml_backend_opencl_context {
|
||||
|
||||
int adreno_wave_size;
|
||||
|
||||
cl_bool non_uniform_workgroups;
|
||||
|
||||
cl_context context;
|
||||
cl_command_queue queue;
|
||||
|
||||
@@ -344,15 +368,8 @@ struct ggml_backend_opencl_context {
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
};
|
||||
|
||||
static ggml_backend_device g_ggml_backend_opencl_device;
|
||||
static ggml_backend_opencl_device_context g_ggml_ctx_dev_main {
|
||||
/*.platform =*/ nullptr,
|
||||
/*.platform_nane =*/ "",
|
||||
/*.device =*/ nullptr,
|
||||
/*.device_name =*/ "",
|
||||
};
|
||||
|
||||
static int ggml_backend_opencl_n_devices = 0;
|
||||
// All registered devices with a default device in the front.
|
||||
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
|
||||
|
||||
// Profiling
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
@@ -1107,25 +1124,19 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT("\n");
|
||||
}
|
||||
|
||||
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
static bool initialized = false;
|
||||
static ggml_backend_opencl_context *backend_ctx = nullptr;
|
||||
// XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
// XXX static bool initialized = false;
|
||||
// XXX static ggml_backend_opencl_context *backend_ctx = nullptr;
|
||||
|
||||
if (initialized) {
|
||||
return backend_ctx;
|
||||
}
|
||||
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
|
||||
|
||||
ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
|
||||
GGML_ASSERT(dev_ctx);
|
||||
GGML_ASSERT(dev_ctx->platform == nullptr);
|
||||
GGML_ASSERT(dev_ctx->device == nullptr);
|
||||
GGML_ASSERT(backend_ctx == nullptr);
|
||||
namespace /* anonymous */ {
|
||||
extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
|
||||
}
|
||||
|
||||
initialized = true;
|
||||
backend_ctx = new ggml_backend_opencl_context();
|
||||
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
|
||||
|
||||
cl_int err;
|
||||
// Look for available and suitable devices.
|
||||
static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_reg * reg) {
|
||||
std::vector<ggml_backend_device> found_devices;
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
|
||||
@@ -1158,11 +1169,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
struct cl_device devices[NDEV];
|
||||
unsigned n_devices = 0;
|
||||
struct cl_device * default_device = NULL;
|
||||
unsigned default_platform_number = 0;
|
||||
|
||||
cl_platform_id platform_ids[NPLAT];
|
||||
if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
|
||||
GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
|
||||
return backend_ctx;
|
||||
return found_devices;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < n_platforms; i++) {
|
||||
@@ -1197,19 +1209,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
}
|
||||
|
||||
if (default_device == NULL && p->default_device != NULL) {
|
||||
default_device = p->default_device;
|
||||
default_device = p->default_device;
|
||||
default_platform_number = i;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_devices == 0) {
|
||||
GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n");
|
||||
return backend_ctx;
|
||||
return found_devices;
|
||||
}
|
||||
|
||||
char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
|
||||
char * user_device_string = getenv("GGML_OPENCL_DEVICE");
|
||||
int user_platform_number = -1;
|
||||
int user_device_number = -1;
|
||||
char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
|
||||
char * user_device_string = getenv("GGML_OPENCL_DEVICE");
|
||||
int user_platform_number = -1;
|
||||
int user_device_number = -1;
|
||||
cl_device * candidate_devices = nullptr;
|
||||
unsigned n_candidate_devices = 0;
|
||||
|
||||
unsigned n;
|
||||
if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
|
||||
@@ -1224,12 +1239,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number);
|
||||
exit(1);
|
||||
}
|
||||
default_device = &platform->devices[user_device_number];
|
||||
default_device = &platform->devices[user_device_number];
|
||||
candidate_devices = platform->devices;
|
||||
n_candidate_devices = platform->n_devices;
|
||||
} else {
|
||||
|
||||
struct cl_device * selected_devices = devices;
|
||||
unsigned n_selected_devices = n_devices;
|
||||
|
||||
// Choose a platform by matching a substring.
|
||||
if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
|
||||
for (unsigned i = 0; i < n_platforms; i++) {
|
||||
struct cl_platform * p = &platforms[i];
|
||||
@@ -1244,20 +1258,20 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (user_platform_number != -1) {
|
||||
struct cl_platform * p = &platforms[user_platform_number];
|
||||
selected_devices = p->devices;
|
||||
n_selected_devices = p->n_devices;
|
||||
default_device = p->default_device;
|
||||
if (n_selected_devices == 0) {
|
||||
GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int platform_idx = user_platform_number != -1 ? user_platform_number : default_platform_number;
|
||||
struct cl_platform * p = &platforms[platform_idx];
|
||||
candidate_devices = p->devices;
|
||||
n_candidate_devices = p->n_devices;
|
||||
default_device = p->default_device;
|
||||
if (n_candidate_devices == 0) {
|
||||
GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
|
||||
for (unsigned i = 0; i < n_selected_devices; i++) {
|
||||
struct cl_device * d = &selected_devices[i];
|
||||
for (unsigned i = 0; i < n_candidate_devices; i++) {
|
||||
struct cl_device * d = &candidate_devices[i];
|
||||
if (strstr(d->name, user_device_string) != NULL) {
|
||||
user_device_number = d->number;
|
||||
break;
|
||||
@@ -1269,71 +1283,145 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
}
|
||||
}
|
||||
if (user_device_number != -1) {
|
||||
selected_devices = &devices[user_device_number];
|
||||
n_selected_devices = 1;
|
||||
default_device = &selected_devices[0];
|
||||
candidate_devices = &devices[user_device_number];
|
||||
n_candidate_devices = 1;
|
||||
default_device = &candidate_devices[0];
|
||||
}
|
||||
|
||||
GGML_ASSERT(n_selected_devices > 0);
|
||||
GGML_ASSERT(n_candidate_devices > 0);
|
||||
|
||||
if (default_device == NULL) {
|
||||
default_device = &selected_devices[0];
|
||||
default_device = &candidate_devices[0];
|
||||
}
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
|
||||
GGML_LOG_INFO("ggml_opencl: selecting device: '%s (%s)'\n", default_device->name, default_device->version);
|
||||
if (default_device->type != CL_DEVICE_TYPE_GPU) {
|
||||
GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
|
||||
GGML_ASSERT(n_candidate_devices != 0 && candidate_devices);
|
||||
|
||||
// Put the default device in front.
|
||||
for (unsigned i = 1; i < n_candidate_devices; i++) {
|
||||
if (&candidate_devices[i] == default_device) {
|
||||
std::swap(candidate_devices[0], candidate_devices[i]);
|
||||
default_device = &candidate_devices[0];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
dev_ctx->platform = default_device->platform->id;
|
||||
dev_ctx->device = default_device->id;
|
||||
backend_ctx->device = default_device->id;
|
||||
GGML_LOG_INFO("ggml_opencl: selected platform: '%s'\n", default_device->platform->name);
|
||||
|
||||
if (strstr(default_device->name, "Adreno") ||
|
||||
strstr(default_device->name, "Qualcomm") ||
|
||||
strstr(default_device->version, "Adreno")) {
|
||||
std::vector<cl_device_id> device_ids;
|
||||
for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
|
||||
device_ids.push_back(dev->id);
|
||||
}
|
||||
|
||||
cl_int err;
|
||||
cl_context shared_context;
|
||||
cl_context_properties properties[] = { (intptr_t) CL_CONTEXT_PLATFORM, (intptr_t) default_device->platform->id, 0 };
|
||||
|
||||
CL_CHECK(
|
||||
(shared_context = clCreateContext(properties, device_ids.size(), device_ids.data(), NULL, NULL, &err), err));
|
||||
|
||||
for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
|
||||
GGML_LOG_INFO("\nggml_opencl: device: '%s (%s)'\n", dev->name, dev->version);
|
||||
|
||||
auto dev_ctx = std::unique_ptr<ggml_backend_opencl_device_context>(new ggml_backend_opencl_device_context{
|
||||
/*.platform =*/dev->platform->id,
|
||||
/*.platform_nane =*/dev->platform->name,
|
||||
/*.device =*/dev->id,
|
||||
/*.device_name =*/dev->name,
|
||||
/*.device_type =*/dev->type,
|
||||
/*.device_version =*/dev->version,
|
||||
/*.backend_ctx =*/nullptr,
|
||||
/*.buffer_type =*/{},
|
||||
/*.context =*/shared_context,
|
||||
});
|
||||
|
||||
found_devices.push_back(ggml_backend_device{
|
||||
/* .iface = */ ggml_backend_opencl_device_i,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ dev_ctx.get(),
|
||||
});
|
||||
|
||||
if (!ggml_cl2_init(&found_devices.back())) {
|
||||
found_devices.pop_back();
|
||||
GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
dev_ctx.release();
|
||||
}
|
||||
|
||||
if (found_devices.size()) {
|
||||
auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(found_devices.front().context);
|
||||
GGML_LOG_INFO("ggml_opencl: default device: '%s (%s)'\n", dev_ctx->device_name.c_str(),
|
||||
dev_ctx->device_version.c_str());
|
||||
|
||||
if (dev_ctx->device_type != CL_DEVICE_TYPE_GPU) {
|
||||
GGML_LOG_WARN("ggml_opencl: warning, the default device is not a GPU: '%s'.\n",
|
||||
dev_ctx->device_name.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
return found_devices;
|
||||
}
|
||||
|
||||
// Initialize device if it is supported (returns nullptr if it is not).
|
||||
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
GGML_ASSERT(dev);
|
||||
GGML_ASSERT(dev->context);
|
||||
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
|
||||
GGML_ASSERT(dev_ctx->platform);
|
||||
GGML_ASSERT(dev_ctx->device);
|
||||
|
||||
if (dev_ctx->backend_ctx) {
|
||||
return dev_ctx->backend_ctx;
|
||||
}
|
||||
|
||||
auto backend_ctx = std::make_unique<ggml_backend_opencl_context>();
|
||||
backend_ctx->device = dev_ctx->device;
|
||||
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
|
||||
|
||||
if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
|
||||
strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
|
||||
strstr(dev_ctx->device_version.c_str(), "Adreno")) {
|
||||
backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
|
||||
// Usually device version contains the detailed device name
|
||||
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version);
|
||||
backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
|
||||
if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
|
||||
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
|
||||
backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
|
||||
}
|
||||
|
||||
// Use wave size of 64 for all Adreno GPUs.
|
||||
backend_ctx->adreno_wave_size = 64;
|
||||
} else if (strstr(default_device->name, "Intel")) {
|
||||
} else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
|
||||
backend_ctx->gpu_family = GPU_FAMILY::INTEL;
|
||||
} else {
|
||||
GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name);
|
||||
GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
|
||||
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
|
||||
return backend_ctx;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
|
||||
GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
|
||||
"run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
|
||||
return backend_ctx;
|
||||
return nullptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Populate backend device name
|
||||
dev_ctx->platform_name = default_device->platform->name;
|
||||
dev_ctx->device_name = default_device->name;
|
||||
backend_ctx->device_name = default_device->name;
|
||||
backend_ctx->device_name = dev_ctx->device_name;
|
||||
|
||||
// A local ref of cl_device_id for convenience
|
||||
cl_device_id device = backend_ctx->device;
|
||||
|
||||
ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
|
||||
ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
|
||||
|
||||
// Check device OpenCL version, OpenCL 2.0 or above is required
|
||||
ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
|
||||
if (opencl_c_version.major < 2) {
|
||||
GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
|
||||
return backend_ctx;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Check driver version
|
||||
@@ -1364,7 +1452,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
// fp16 is required
|
||||
if (!backend_ctx->fp16_support) {
|
||||
GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
|
||||
return backend_ctx;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
|
||||
@@ -1373,7 +1461,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
|
||||
GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
|
||||
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
|
||||
return backend_ctx;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
cl_uint base_align_in_bits;
|
||||
@@ -1397,6 +1485,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
|
||||
svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
|
||||
|
||||
if (opencl_c_version.major >= 3) {
|
||||
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
|
||||
&backend_ctx->non_uniform_workgroups, 0));
|
||||
} else {
|
||||
GGML_ASSERT(opencl_c_version.major == 2);
|
||||
// Non-uniform workgroup sizes is mandatory feature in v2.x.
|
||||
backend_ctx->non_uniform_workgroups = true;
|
||||
}
|
||||
|
||||
// Print out configurations
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
|
||||
@@ -1406,14 +1503,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
cl_context_properties properties[] = {
|
||||
(intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0
|
||||
};
|
||||
|
||||
CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
|
||||
cl_int err;
|
||||
|
||||
// A local ref of cl_context for convenience
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_context context = backend_ctx->context = dev_ctx->context;
|
||||
|
||||
//CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
|
||||
// (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
|
||||
@@ -1426,7 +1519,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
|
||||
|
||||
// Load kernels
|
||||
load_cl_kernels(backend_ctx, opencl_c_version);
|
||||
load_cl_kernels(backend_ctx.get(), opencl_c_version);
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Allocate intermediate buffers and images
|
||||
@@ -1456,10 +1549,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
// For now we support a single devices
|
||||
ggml_backend_opencl_n_devices = 1;
|
||||
|
||||
return backend_ctx;
|
||||
dev_ctx->backend_ctx = backend_ctx.release();
|
||||
return dev_ctx->backend_ctx;
|
||||
}
|
||||
|
||||
static void ggml_cl2_free(void) {
|
||||
@@ -1664,10 +1755,46 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
// Syncronizes the 'backend_ctx's device with others so that commands
|
||||
// enqueued to it won't start until commands in the other devices have
|
||||
// completed.
|
||||
static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
|
||||
if (g_ggml_backend_opencl_devices.size() < 2)
|
||||
return; // No other devices to synchronize with.
|
||||
|
||||
std::vector<cl_event> events;
|
||||
events.reserve(g_ggml_backend_opencl_devices.size());
|
||||
|
||||
for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) {
|
||||
auto * other_backend_ctx = ggml_cl2_init(&backend_dev);
|
||||
if (backend_ctx != other_backend_ctx) {
|
||||
cl_event ev;
|
||||
CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
|
||||
CL_CHECK(clFlush(other_backend_ctx->queue));
|
||||
events.push_back(ev);
|
||||
}
|
||||
}
|
||||
|
||||
CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, events.size(), events.data(), nullptr));
|
||||
for (auto ev : events) {
|
||||
CL_CHECK(clReleaseEvent(ev));
|
||||
}
|
||||
}
|
||||
|
||||
static void sync_with_other_backends(ggml_backend_t backend) {
|
||||
auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
|
||||
sync_with_other_backends(backend_ctx);
|
||||
}
|
||||
|
||||
static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
// NOTE: this may oversynchronize by synchronizing with
|
||||
// backends/devices which don't compute 'cgraph's
|
||||
// dependencies.
|
||||
sync_with_other_backends(backend);
|
||||
|
||||
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
||||
continue;
|
||||
}
|
||||
@@ -2058,15 +2185,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
// The original tensor memory is divided into scales and quants, i.e.,
|
||||
// we first store scales, then quants.
|
||||
// Create subbuffer for scales.
|
||||
region.origin = extra_orig->offset + tensor->view_offs + offset;
|
||||
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
|
||||
region.size = size_d;
|
||||
extra->d = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
CL_CHECK(err);
|
||||
auto previous_origin = region.origin;
|
||||
|
||||
// Create subbuffer for quants.
|
||||
region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
|
||||
region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
|
||||
region.size = size_q;
|
||||
extra->q = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
@@ -2271,8 +2399,8 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
// Make sure all previously submitted commands are finished.
|
||||
CL_CHECK(clFinish(queue));
|
||||
// Make sure all previously submitted commands in other devices are finished.
|
||||
sync_with_other_backends(backend_ctx);
|
||||
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
// In end-to-end runs, get_tensor is usually used to get back the logits,
|
||||
@@ -2376,13 +2504,8 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
|
||||
}
|
||||
|
||||
static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
|
||||
// FIXME: not thread safe, device may not be initialized yet
|
||||
static cl_uint alignment = -1;
|
||||
if (alignment == (cl_uint)-1) {
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
|
||||
alignment = backend_ctx->alignment;
|
||||
}
|
||||
return alignment;
|
||||
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
|
||||
return backend_ctx->alignment;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
|
||||
@@ -2409,16 +2532,6 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
|
||||
/* .is_host = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
|
||||
static ggml_backend_buffer_type buffer_type = {
|
||||
/* .iface = */ ggml_backend_opencl_buffer_type_interface,
|
||||
/* .device = */ &g_ggml_backend_opencl_device,
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return &buffer_type;
|
||||
}
|
||||
|
||||
//
|
||||
// backend device
|
||||
//
|
||||
@@ -2476,9 +2589,15 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
return ggml_backend_opencl_buffer_type();
|
||||
auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(dev->context);
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
dev_ctx->buffer_type = ggml_backend_buffer_type{
|
||||
/* .iface = */ ggml_backend_opencl_buffer_type_interface,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return &dev_ctx->buffer_type;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
@@ -2494,12 +2613,21 @@ static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const
|
||||
}
|
||||
|
||||
static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name;
|
||||
// Check 'dev' and 'buffer_type' are not objects belonging to this backend.
|
||||
if (dev->iface.get_name != ggml_backend_opencl_device_get_name ||
|
||||
buft->iface.get_name != ggml_backend_opencl_buffer_type_get_name) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
// Check cl_context is the same. clEnqueue* commands may not use
|
||||
// buffers from another cl_context.
|
||||
ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev);
|
||||
ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device);
|
||||
return backend_ctx0->context == backend_ctx1->context;
|
||||
}
|
||||
|
||||
static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
|
||||
namespace /* anonymous */ {
|
||||
struct ggml_backend_device_i ggml_backend_opencl_device_i = {
|
||||
/* .get_name = */ ggml_backend_opencl_device_get_name,
|
||||
/* .get_description = */ ggml_backend_opencl_device_get_description,
|
||||
/* .get_memory = */ ggml_backend_opencl_device_get_memory,
|
||||
@@ -2516,6 +2644,7 @@ static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
}
|
||||
|
||||
// Backend registry
|
||||
|
||||
@@ -2526,15 +2655,15 @@ static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) {
|
||||
}
|
||||
|
||||
static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
|
||||
return ggml_backend_opencl_n_devices;
|
||||
return g_ggml_backend_opencl_devices.size();
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
|
||||
GGML_ASSERT(index == 0);
|
||||
GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg));
|
||||
|
||||
return &g_ggml_backend_opencl_device;
|
||||
return &g_ggml_backend_opencl_devices[index];
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
GGML_UNUSED(index);
|
||||
@@ -2548,27 +2677,23 @@ static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = {
|
||||
};
|
||||
|
||||
ggml_backend_reg_t ggml_backend_opencl_reg(void) {
|
||||
// TODO: make this thread-safe somehow?
|
||||
static std::mutex mutex;
|
||||
static ggml_backend_reg reg;
|
||||
static bool initialized = false;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
if (!initialized) {
|
||||
reg = ggml_backend_reg {
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_opencl_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
g_ggml_backend_opencl_device = ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_opencl_device_i,
|
||||
/* .reg = */ ®,
|
||||
/* .context = */ &g_ggml_ctx_dev_main,
|
||||
};
|
||||
|
||||
ggml_cl2_init(&g_ggml_backend_opencl_device);
|
||||
|
||||
initialized = true;
|
||||
if (initialized) {
|
||||
return ®
|
||||
}
|
||||
initialized = true;
|
||||
|
||||
g_ggml_backend_opencl_devices = ggml_opencl_probe_devices(®);
|
||||
|
||||
reg = ggml_backend_reg{
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_opencl_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return ®
|
||||
}
|
||||
@@ -2942,14 +3067,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
} else {
|
||||
unsigned int nth = MIN(64, ne0);
|
||||
@@ -3077,14 +3207,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
} else {
|
||||
unsigned int nth = MIN(64, ne0);
|
||||
@@ -3233,14 +3368,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -3273,14 +3413,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -3320,14 +3465,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -4230,14 +4380,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -4418,14 +4573,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
|
||||
size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
+16
-1
@@ -1099,9 +1099,10 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
||||
"HARDSWISH",
|
||||
"HARDSIGMOID",
|
||||
"EXP",
|
||||
"GELU_ERF",
|
||||
};
|
||||
|
||||
static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14");
|
||||
static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15");
|
||||
|
||||
|
||||
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
||||
@@ -2501,6 +2502,20 @@ struct ggml_tensor * ggml_gelu_inplace(
|
||||
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
|
||||
}
|
||||
|
||||
// ggml_gelu_erf
|
||||
|
||||
struct ggml_tensor * ggml_gelu_erf(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_ERF);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_gelu_erf_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_ERF);
|
||||
}
|
||||
|
||||
// ggml_gelu_quick
|
||||
|
||||
struct ggml_tensor * ggml_gelu_quick(
|
||||
|
||||
@@ -251,7 +251,7 @@ class GGUFReader:
|
||||
offs += curr_size
|
||||
return offs - orig_offs, aparts, data_idxs, types
|
||||
# We can't deal with this one.
|
||||
raise ValueError('Unknown/unhandled field type {gtype}')
|
||||
raise ValueError(f'Unknown/unhandled field type {gtype}')
|
||||
|
||||
def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
|
||||
offs = orig_offs
|
||||
|
||||
+4
-2
@@ -610,10 +610,12 @@ extern "C" {
|
||||
|
||||
// Returns the number of tokens in the KV cache (slow, use only for debug)
|
||||
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
|
||||
LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
|
||||
DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
|
||||
"Use llama_kv_self_seq_pos_max() instead");
|
||||
|
||||
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
|
||||
LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
|
||||
DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
|
||||
"Use llama_kv_self_seq_pos_max() instead");
|
||||
|
||||
// Clear the KV cache - both cell info is erased and KV data is zeroed
|
||||
LLAMA_API void llama_kv_self_clear(
|
||||
|
||||
+3
-1
@@ -1,5 +1,6 @@
|
||||
#include "llama-batch.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
|
||||
@@ -281,9 +282,10 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
|
||||
batch = in_batch;
|
||||
GGML_ASSERT(batch.n_tokens > 0);
|
||||
if (!batch.pos) {
|
||||
assert(p0 >= 0);
|
||||
pos.resize(batch.n_tokens);
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
pos[i] = i + p0;
|
||||
pos[i] = p0 + i;
|
||||
}
|
||||
batch.pos = pos.data();
|
||||
}
|
||||
|
||||
+35
-4
@@ -857,11 +857,17 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!inp_batch.pos) {
|
||||
if (inp_batch.seq_id) {
|
||||
LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
// temporary allocate memory for the input batch if needed
|
||||
// TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1);
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->seq_pos_max(0) + 1);
|
||||
|
||||
const llama_batch & batch = batch_allocr.batch;
|
||||
|
||||
@@ -2292,22 +2298,47 @@ int32_t llama_apply_adapter_cvec(
|
||||
// kv cache
|
||||
//
|
||||
|
||||
// deprecated
|
||||
int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
|
||||
const auto * kv = ctx->get_kv_self();
|
||||
if (!kv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return kv->get_n_tokens();
|
||||
int32_t res = 0;
|
||||
|
||||
for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
|
||||
const llama_pos p0 = kv->seq_pos_min(s);
|
||||
const llama_pos p1 = kv->seq_pos_max(s);
|
||||
|
||||
if (p0 >= 0) {
|
||||
res += (p1 - p0) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// deprecated
|
||||
// note: this is the same as above - will be removed anyway, so it's ok
|
||||
int32_t llama_kv_self_used_cells(const llama_context * ctx) {
|
||||
const auto * kv = ctx->get_kv_self();
|
||||
if (!kv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return kv->get_used_cells();
|
||||
int32_t res = 0;
|
||||
|
||||
for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
|
||||
const llama_pos p0 = kv->seq_pos_min(s);
|
||||
const llama_pos p1 = kv->seq_pos_max(s);
|
||||
|
||||
if (p0 >= 0) {
|
||||
res += (p1 - p0) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void llama_kv_self_clear(llama_context * ctx) {
|
||||
|
||||
@@ -72,7 +72,7 @@ uint32_t llama_hparams::n_embd_v_s() const {
|
||||
|
||||
bool llama_hparams::is_swa(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
|
||||
return n_swa_pattern == 0 || (il % n_swa_pattern < (n_swa_pattern - 1));
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
|
||||
+12
-1
@@ -104,7 +104,18 @@ struct llama_hparams {
|
||||
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
|
||||
uint32_t n_swa = 0; // the size of the sliding window (0 - no SWA)
|
||||
uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
|
||||
uint32_t n_swa_pattern = 1; // this value n means that every nth layer is dense (i.e. non-SWA)
|
||||
// by default n == 1, all layers are dense
|
||||
// note that if n_swa_pattern == 0, all layers are SWA
|
||||
// example: n_swa_pattern = 3
|
||||
// il == 0: swa
|
||||
// il == 1: swa
|
||||
// il == 2: dense
|
||||
// il == 3: swa
|
||||
// il == 4: swa
|
||||
// il == 5: dense
|
||||
// il == 6: swa
|
||||
// etc ...
|
||||
|
||||
// for State Space Models
|
||||
uint32_t ssm_d_conv = 0;
|
||||
|
||||
+24
-87
@@ -30,13 +30,14 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
||||
bool v_trans,
|
||||
bool offload,
|
||||
uint32_t kv_size,
|
||||
uint32_t padding,
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_pad,
|
||||
uint32_t n_swa,
|
||||
llama_swa_type swa_type) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding), n_swa(n_swa), swa_type(swa_type) {
|
||||
GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");
|
||||
llama_swa_type swa_type) :
|
||||
model(model), hparams(model.hparams), v_trans(v_trans),
|
||||
n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
|
||||
|
||||
this->type_k = type_k;
|
||||
this->type_v = type_v;
|
||||
GGML_ASSERT(kv_size % n_pad == 0);
|
||||
|
||||
// create a context for each buffer type
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
@@ -129,8 +130,8 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
||||
const size_t memory_size_k = size_k_bytes();
|
||||
const size_t memory_size_v = size_v_bytes();
|
||||
|
||||
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6d cells, %3d layers), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
|
||||
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(),
|
||||
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
|
||||
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max,
|
||||
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
|
||||
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
||||
}
|
||||
@@ -442,7 +443,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
void llama_kv_cache_unified::defrag_sched(float thold) {
|
||||
// - do not defrag small contexts (i.e. < 2048 tokens)
|
||||
// - count the padding towards the number of used tokens
|
||||
const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f;
|
||||
const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f;
|
||||
|
||||
// queue defragmentation for next llama_kv_cache_update
|
||||
if (fragmentation > thold) {
|
||||
@@ -558,7 +559,7 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
|
||||
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
||||
// after enough generations, the benefit from this heuristic disappears
|
||||
// if we start defragmenting the cache, the benefit from this will be more important
|
||||
n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding)));
|
||||
n = std::min(size, std::max(n_pad, GGML_PAD(cell_max(), n_pad)));
|
||||
|
||||
#ifdef FIND_SLOT_DEBUG
|
||||
LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
|
||||
@@ -567,20 +568,6 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_unified::get_n_tokens() const {
|
||||
int32_t result = 0;
|
||||
|
||||
for (uint32_t i = 0; i < size; i++) {
|
||||
result += cells[i].seq_id.size();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_unified::get_used_cells() const {
|
||||
return used;
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified::get_can_shift() const {
|
||||
return true;
|
||||
}
|
||||
@@ -802,16 +789,6 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama
|
||||
}
|
||||
}
|
||||
|
||||
llama_pos llama_kv_cache_unified::get_pos_max() const {
|
||||
llama_pos pos_max = -1;
|
||||
|
||||
for (const auto & cell : cells) {
|
||||
pos_max = std::max(pos_max, cell.pos);
|
||||
}
|
||||
|
||||
return pos_max;
|
||||
}
|
||||
|
||||
size_t llama_kv_cache_unified::total_size() const {
|
||||
size_t size = 0;
|
||||
|
||||
@@ -1501,11 +1478,8 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||
llama_seq_id seq_id;
|
||||
io.read_to(&seq_id, sizeof(seq_id));
|
||||
|
||||
// TODO: llama_kv_cache_unified should have a notion of max sequences
|
||||
//if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
|
||||
if (seq_id < 0) {
|
||||
//LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
|
||||
if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) {
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1655,17 +1629,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
|
||||
ggml_type type_v,
|
||||
bool v_trans,
|
||||
bool offload,
|
||||
uint32_t kv_size,
|
||||
bool swa_full,
|
||||
uint32_t kv_size,
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_batch,
|
||||
uint32_t padding) : hparams(model.hparams) {
|
||||
uint32_t n_pad) : hparams(model.hparams) {
|
||||
llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
|
||||
llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
|
||||
|
||||
const uint32_t size_base = kv_size;
|
||||
|
||||
uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding));
|
||||
uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad));
|
||||
|
||||
// when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning
|
||||
if (swa_full) {
|
||||
@@ -1680,14 +1654,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
|
||||
|
||||
kv_base = std::make_unique<llama_kv_cache_unified>(
|
||||
model, std::move(filter_base), type_k, type_v,
|
||||
v_trans, offload, size_base, padding,
|
||||
v_trans, offload, size_base, n_seq_max, n_pad,
|
||||
0, LLAMA_SWA_TYPE_NONE);
|
||||
|
||||
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
|
||||
|
||||
kv_swa = std::make_unique<llama_kv_cache_unified>(
|
||||
model, std::move(filter_swa), type_k, type_v,
|
||||
v_trans, offload, size_swa, padding,
|
||||
v_trans, offload, size_swa, n_seq_max, n_pad,
|
||||
hparams.n_swa, hparams.swa_type);
|
||||
}
|
||||
|
||||
@@ -1810,18 +1784,6 @@ bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) {
|
||||
return res;
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_unified_iswa::get_n_tokens() const {
|
||||
return kv_base->get_n_tokens();
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_unified_iswa::get_used_cells() const {
|
||||
return kv_base->get_used_cells();
|
||||
}
|
||||
|
||||
llama_pos llama_kv_cache_unified_iswa::get_pos_max() const {
|
||||
return kv_base->get_pos_max();
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified_iswa::get_can_shift() const {
|
||||
return kv_base->get_size() == kv_swa->get_size();
|
||||
}
|
||||
@@ -1853,19 +1815,17 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
|
||||
ggml_type type_k,
|
||||
ggml_type type_v,
|
||||
bool offload,
|
||||
uint32_t kv_size) : hparams(model.hparams) {
|
||||
uint32_t kv_size,
|
||||
uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
|
||||
const int32_t n_layer = hparams.n_layer;
|
||||
|
||||
LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n",
|
||||
__func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
|
||||
LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n",
|
||||
__func__, kv_size, n_seq_max, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
|
||||
|
||||
head = 0;
|
||||
size = kv_size;
|
||||
used = 0;
|
||||
|
||||
this->type_k = type_k;
|
||||
this->type_v = type_v;
|
||||
|
||||
cells.clear();
|
||||
cells.resize(kv_size);
|
||||
|
||||
@@ -2203,8 +2163,8 @@ void llama_kv_cache_recurrent::commit() {
|
||||
pending.ranges.clear();
|
||||
}
|
||||
|
||||
bool llama_kv_cache_recurrent::update(llama_context & lctx) {
|
||||
GGML_UNUSED(lctx);
|
||||
bool llama_kv_cache_recurrent::update(llama_context & ctx) {
|
||||
GGML_UNUSED(ctx);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2265,7 +2225,7 @@ bool llama_kv_cache_recurrent::find_slot(
|
||||
if (seq_id < 0 || (uint32_t) seq_id >= size) {
|
||||
// too big seq_id
|
||||
// TODO: would it be possible to resize the cache instead?
|
||||
LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
|
||||
LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max);
|
||||
return false;
|
||||
}
|
||||
if (j > 0) {
|
||||
@@ -2408,29 +2368,6 @@ bool llama_kv_cache_recurrent::find_slot(
|
||||
return n >= n_seqs;
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_recurrent::get_n_tokens() const {
|
||||
int32_t result = 0;
|
||||
|
||||
for (uint32_t i = 0; i < size; i++) {
|
||||
result += cells[i].seq_id.size();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_recurrent::get_used_cells() const {
|
||||
return used;
|
||||
}
|
||||
|
||||
llama_pos llama_kv_cache_recurrent::get_pos_max() const {
|
||||
llama_pos pos_max = -1;
|
||||
for (const auto & cell : cells) {
|
||||
pos_max = std::max(pos_max, cell.pos);
|
||||
}
|
||||
|
||||
return pos_max;
|
||||
}
|
||||
|
||||
bool llama_kv_cache_recurrent::get_can_shift() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
+15
-36
@@ -55,10 +55,7 @@ struct llama_kv_cache : public llama_memory_i {
|
||||
// =============================================================================================================
|
||||
|
||||
// getters
|
||||
virtual int32_t get_n_tokens() const = 0;
|
||||
virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
|
||||
virtual llama_pos get_pos_max() const = 0;
|
||||
virtual bool get_can_shift() const = 0;
|
||||
virtual bool get_can_shift() const = 0;
|
||||
|
||||
bool get_can_edit() const override { return get_can_shift(); }
|
||||
|
||||
@@ -108,7 +105,8 @@ public:
|
||||
bool v_trans,
|
||||
bool offload,
|
||||
uint32_t kv_size,
|
||||
uint32_t padding,
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_pad,
|
||||
uint32_t n_swa,
|
||||
llama_swa_type swa_type);
|
||||
|
||||
@@ -150,12 +148,6 @@ public:
|
||||
// to the first cell of the slot.
|
||||
bool find_slot(const llama_ubatch & batch) override;
|
||||
|
||||
int32_t get_n_tokens() const override;
|
||||
int32_t get_used_cells() const override;
|
||||
|
||||
// TODO: better data structures to reduce the cost of this operation
|
||||
llama_pos get_pos_max() const override;
|
||||
|
||||
bool get_can_shift() const override;
|
||||
|
||||
// state write/load
|
||||
@@ -228,16 +220,15 @@ private:
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
// required padding
|
||||
uint32_t padding = 1;
|
||||
const uint32_t n_seq_max = 1;
|
||||
|
||||
ggml_type type_k = GGML_TYPE_F16;
|
||||
ggml_type type_v = GGML_TYPE_F16;
|
||||
// required padding
|
||||
const uint32_t n_pad = 1;
|
||||
|
||||
// SWA
|
||||
uint32_t n_swa = 0;
|
||||
const uint32_t n_swa = 0;
|
||||
|
||||
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
@@ -317,11 +308,11 @@ public:
|
||||
ggml_type type_v,
|
||||
bool v_trans,
|
||||
bool offload,
|
||||
uint32_t kv_size,
|
||||
bool swa_full,
|
||||
uint32_t kv_size,
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_batch,
|
||||
uint32_t padding);
|
||||
uint32_t n_pad);
|
||||
|
||||
~llama_kv_cache_unified_iswa() = default;
|
||||
|
||||
@@ -358,12 +349,6 @@ public:
|
||||
|
||||
bool find_slot(const llama_ubatch & batch) override;
|
||||
|
||||
int32_t get_n_tokens() const override;
|
||||
int32_t get_used_cells() const override;
|
||||
|
||||
// TODO: better data structures to reduce the cost of this operation
|
||||
llama_pos get_pos_max() const override;
|
||||
|
||||
bool get_can_shift() const override;
|
||||
|
||||
// state write/load
|
||||
@@ -432,7 +417,8 @@ public:
|
||||
ggml_type type_k,
|
||||
ggml_type type_v,
|
||||
bool offload,
|
||||
uint32_t kv_size);
|
||||
uint32_t kv_size,
|
||||
uint32_t n_seq_max);
|
||||
|
||||
~llama_kv_cache_recurrent() = default;
|
||||
|
||||
@@ -444,7 +430,7 @@ public:
|
||||
|
||||
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
|
||||
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
|
||||
void seq_keep(llama_seq_id seq_id) override;
|
||||
void seq_keep(llama_seq_id seq_id) override;
|
||||
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
|
||||
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
|
||||
|
||||
@@ -458,7 +444,7 @@ public:
|
||||
void restore() override;
|
||||
void commit() override;
|
||||
|
||||
bool update(llama_context & lctx) override;
|
||||
bool update(llama_context & ctx) override;
|
||||
|
||||
void defrag_sched(float thold) override;
|
||||
|
||||
@@ -469,12 +455,6 @@ public:
|
||||
|
||||
bool find_slot(const llama_ubatch & batch) override;
|
||||
|
||||
int32_t get_n_tokens() const override;
|
||||
int32_t get_used_cells() const override;
|
||||
|
||||
// TODO: better data structures to reduce the cost of this operation
|
||||
llama_pos get_pos_max() const override;
|
||||
|
||||
bool get_can_shift() const override;
|
||||
|
||||
// TODO: temporary methods - they are not really const as they do const_cast<>, fix this
|
||||
@@ -514,8 +494,7 @@ private:
|
||||
std::vector<slot_range> ranges;
|
||||
} pending;
|
||||
|
||||
ggml_type type_k = GGML_TYPE_F16;
|
||||
ggml_type type_v = GGML_TYPE_F16;
|
||||
const uint32_t n_seq_max = 1;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
+4
-2
@@ -13203,7 +13203,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
GGML_TYPE_F32,
|
||||
GGML_TYPE_F32,
|
||||
cparams.offload_kqv,
|
||||
std::max((uint32_t) 1, cparams.n_seq_max));
|
||||
std::max((uint32_t) 1, cparams.n_seq_max),
|
||||
cparams.n_seq_max);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -13222,8 +13223,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
params.type_v,
|
||||
!cparams.flash_attn,
|
||||
cparams.offload_kqv,
|
||||
cparams.n_ctx,
|
||||
params.swa_full,
|
||||
cparams.n_ctx,
|
||||
cparams.n_seq_max,
|
||||
cparams.n_batch,
|
||||
padding);
|
||||
@@ -13238,6 +13239,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
!cparams.flash_attn,
|
||||
cparams.offload_kqv,
|
||||
cparams.n_ctx,
|
||||
cparams.n_seq_max,
|
||||
padding,
|
||||
hparams.n_swa,
|
||||
hparams.swa_type);
|
||||
|
||||
+2
-2
@@ -936,7 +936,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama
|
||||
// Function to tokenize the prompt
|
||||
static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
|
||||
std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
|
||||
const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0;
|
||||
const bool is_first = llama_kv_self_seq_pos_max(llama_data.context.get(), 0) == 0;
|
||||
|
||||
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
|
||||
prompt_tokens.resize(n_prompt_tokens);
|
||||
@@ -952,7 +952,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
|
||||
// Check if we have enough space in the context to evaluate this batch
|
||||
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
|
||||
const int n_ctx = llama_n_ctx(ctx.get());
|
||||
const int n_ctx_used = llama_kv_self_used_cells(ctx.get());
|
||||
const int n_ctx_used = llama_kv_self_seq_pos_max(ctx.get(), 0);
|
||||
if (n_ctx_used + batch.n_tokens > n_ctx) {
|
||||
printf(LOG_COL_DEFAULT "\n");
|
||||
printe("context size exceeded\n");
|
||||
|
||||
+78
-28
@@ -951,7 +951,7 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
}
|
||||
|
||||
json to_json_oaicompat_chat() {
|
||||
bool first = n_decoded == 0;
|
||||
bool first = n_decoded == 1;
|
||||
std::time_t t = std::time(0);
|
||||
json choices;
|
||||
|
||||
@@ -962,15 +962,18 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
{"delta", json{{"role", "assistant"}}}}});
|
||||
} else {
|
||||
// We have to send this as two updates to conform to openai behavior
|
||||
// initial_ret is the role message for stream=True
|
||||
json initial_ret = json{{"choices", json::array({json{
|
||||
{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{
|
||||
{"role", "assistant"}
|
||||
{"role", "assistant"},
|
||||
{"content", ""}
|
||||
}}}})},
|
||||
{"created", t},
|
||||
{"id", oaicompat_cmpl_id},
|
||||
{"model", oaicompat_model},
|
||||
{"system_fingerprint", build_info},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
json second_ret = json{
|
||||
@@ -982,8 +985,19 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
{"created", t},
|
||||
{"id", oaicompat_cmpl_id},
|
||||
{"model", oaicompat_model},
|
||||
{"system_fingerprint", build_info},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
if (prob_output.probs.size() > 0) {
|
||||
second_ret["choices"][0]["logprobs"] = json{
|
||||
{"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
|
||||
};
|
||||
}
|
||||
|
||||
if (timings.prompt_n >= 0) {
|
||||
second_ret.push_back({"timings", timings.to_json()});
|
||||
}
|
||||
|
||||
return std::vector<json>({initial_ret, second_ret});
|
||||
}
|
||||
} else {
|
||||
@@ -1137,9 +1151,6 @@ struct server_task_result_metrics : server_task_result {
|
||||
int n_tasks_deferred;
|
||||
int64_t t_start;
|
||||
|
||||
int32_t kv_cache_tokens_count;
|
||||
int32_t kv_cache_used_cells;
|
||||
|
||||
// TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
|
||||
uint64_t n_prompt_tokens_processed_total = 0;
|
||||
uint64_t t_prompt_processing_total = 0;
|
||||
@@ -1179,9 +1190,6 @@ struct server_task_result_metrics : server_task_result {
|
||||
{ "n_decode_total", n_decode_total },
|
||||
{ "n_busy_slots_total", n_busy_slots_total },
|
||||
|
||||
{ "kv_cache_tokens_count", kv_cache_tokens_count },
|
||||
{ "kv_cache_used_cells", kv_cache_used_cells },
|
||||
|
||||
{ "slots", slots_data },
|
||||
};
|
||||
}
|
||||
@@ -2771,9 +2779,6 @@ struct server_context {
|
||||
res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
|
||||
res->t_start = metrics.t_start;
|
||||
|
||||
res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
|
||||
res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);
|
||||
|
||||
res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
|
||||
res->t_prompt_processing_total = metrics.t_prompt_processing_total;
|
||||
res->n_tokens_predicted_total = metrics.n_tokens_predicted_total;
|
||||
@@ -3361,14 +3366,29 @@ struct server_context {
|
||||
metrics.on_decoded(slots);
|
||||
|
||||
if (ret != 0) {
|
||||
if (n_batch == 1 || ret < 0) {
|
||||
// if you get here, it means the KV cache is full - try increasing it via the context size
|
||||
SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
|
||||
for (auto & slot : slots) {
|
||||
slot.release();
|
||||
send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
|
||||
{
|
||||
std::string err;
|
||||
|
||||
if (n_batch == 1 && ret == 1) {
|
||||
err = "Context size has been exceeded.";
|
||||
}
|
||||
|
||||
if (ret == -1) {
|
||||
err = "Invalid input batch.";
|
||||
}
|
||||
|
||||
if (ret < -1) {
|
||||
err = "Compute error.";
|
||||
}
|
||||
|
||||
if (!err.empty()) {
|
||||
SRV_ERR("%s, i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret);
|
||||
for (auto & slot : slots) {
|
||||
slot.release();
|
||||
send_error(slot, err);
|
||||
}
|
||||
break;
|
||||
}
|
||||
break; // break loop of n_batch
|
||||
}
|
||||
|
||||
// retry with half the batch size to try to find a free slot in the KV cache
|
||||
@@ -3702,6 +3722,7 @@ int main(int argc, char ** argv) {
|
||||
"/health",
|
||||
"/models",
|
||||
"/v1/models",
|
||||
"/api/tags"
|
||||
};
|
||||
|
||||
// If API key is not set, skip validation
|
||||
@@ -3740,7 +3761,7 @@ int main(int argc, char ** argv) {
|
||||
if (req.path == "/" || tmp.back() == "html") {
|
||||
res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
|
||||
res.status = 503;
|
||||
} else if (req.path == "/models" || req.path == "/v1/models") {
|
||||
} else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") {
|
||||
// allow the models endpoint to be accessed during loading
|
||||
return true;
|
||||
} else {
|
||||
@@ -3883,14 +3904,6 @@ int main(int argc, char ** argv) {
|
||||
{"name", "predicted_tokens_seconds"},
|
||||
{"help", "Average generation throughput in tokens/s."},
|
||||
{"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.}
|
||||
},{
|
||||
{"name", "kv_cache_usage_ratio"},
|
||||
{"help", "KV-cache usage. 1 means 100 percent usage."},
|
||||
{"value", 1. * res_metrics->kv_cache_used_cells / params.n_ctx}
|
||||
},{
|
||||
{"name", "kv_cache_tokens"},
|
||||
{"help", "KV-cache tokens."},
|
||||
{"value", (uint64_t) res_metrics->kv_cache_tokens_count}
|
||||
},{
|
||||
{"name", "requests_processing"},
|
||||
{"help", "Number of requests processing."},
|
||||
@@ -4086,6 +4099,19 @@ int main(int argc, char ** argv) {
|
||||
{ "llama.context_length", ctx_server.slots.back().n_ctx, },
|
||||
}
|
||||
},
|
||||
{"modelfile", ""},
|
||||
{"parameters", ""},
|
||||
{"template", common_chat_templates_source(ctx_server.chat_templates.get())},
|
||||
{"details", {
|
||||
{"parent_model", ""},
|
||||
{"format", "gguf"},
|
||||
{"family", ""},
|
||||
{"families", {""}},
|
||||
{"parameter_size", ""},
|
||||
{"quantization_level", ""}
|
||||
}},
|
||||
{"model_info", ""},
|
||||
{"capabilities", {"completion"}}
|
||||
};
|
||||
|
||||
res_ok(res, data);
|
||||
@@ -4411,6 +4437,28 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
json models = {
|
||||
{"models", {
|
||||
{
|
||||
{"name", params.model_alias.empty() ? params.model.path : params.model_alias},
|
||||
{"model", params.model_alias.empty() ? params.model.path : params.model_alias},
|
||||
{"modified_at", ""},
|
||||
{"size", ""},
|
||||
{"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
|
||||
{"type", "model"},
|
||||
{"description", ""},
|
||||
{"tags", {""}},
|
||||
{"capabilities", {"completion"}},
|
||||
{"parameters", ""},
|
||||
{"details", {
|
||||
{"parent_model", ""},
|
||||
{"format", "gguf"},
|
||||
{"family", ""},
|
||||
{"families", {""}},
|
||||
{"parameter_size", ""},
|
||||
{"quantization_level", ""}
|
||||
}}
|
||||
}
|
||||
}},
|
||||
{"object", "list"},
|
||||
{"data", {
|
||||
{
|
||||
@@ -4420,7 +4468,7 @@ int main(int argc, char ** argv) {
|
||||
{"owned_by", "llamacpp"},
|
||||
{"meta", model_meta},
|
||||
},
|
||||
}}
|
||||
}}
|
||||
};
|
||||
|
||||
res_ok(res, models);
|
||||
@@ -4748,11 +4796,13 @@ int main(int argc, char ** argv) {
|
||||
svr->Post("/api/show", handle_api_show);
|
||||
svr->Get ("/models", handle_models); // public endpoint (no API key check)
|
||||
svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
|
||||
svr->Get ("/api/tags", handle_models); // ollama specific endpoint. public endpoint (no API key check)
|
||||
svr->Post("/completion", handle_completions); // legacy
|
||||
svr->Post("/completions", handle_completions);
|
||||
svr->Post("/v1/completions", handle_completions_oai);
|
||||
svr->Post("/chat/completions", handle_chat_completions);
|
||||
svr->Post("/v1/chat/completions", handle_chat_completions);
|
||||
svr->Post("/api/chat", handle_chat_completions); // ollama specific endpoint
|
||||
svr->Post("/infill", handle_infill);
|
||||
svr->Post("/embedding", handle_embeddings); // legacy
|
||||
svr->Post("/embeddings", handle_embeddings);
|
||||
|
||||
@@ -71,8 +71,14 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
|
||||
})
|
||||
content = ""
|
||||
last_cmpl_id = None
|
||||
for data in res:
|
||||
for i, data in enumerate(res):
|
||||
choice = data["choices"][0]
|
||||
if i == 0:
|
||||
# Check first role message for stream=True
|
||||
assert choice["delta"]["content"] == ""
|
||||
assert choice["delta"]["role"] == "assistant"
|
||||
else:
|
||||
assert "role" not in choice["delta"]
|
||||
assert data["system_fingerprint"].startswith("b")
|
||||
assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
|
||||
if last_cmpl_id is None:
|
||||
@@ -242,12 +248,18 @@ def test_chat_completion_with_timings_per_token():
|
||||
"stream": True,
|
||||
"timings_per_token": True,
|
||||
})
|
||||
for data in res:
|
||||
assert "timings" in data
|
||||
assert "prompt_per_second" in data["timings"]
|
||||
assert "predicted_per_second" in data["timings"]
|
||||
assert "predicted_n" in data["timings"]
|
||||
assert data["timings"]["predicted_n"] <= 10
|
||||
for i, data in enumerate(res):
|
||||
if i == 0:
|
||||
# Check first role message for stream=True
|
||||
assert data["choices"][0]["delta"]["content"] == ""
|
||||
assert data["choices"][0]["delta"]["role"] == "assistant"
|
||||
else:
|
||||
assert "role" not in data["choices"][0]["delta"]
|
||||
assert "timings" in data
|
||||
assert "prompt_per_second" in data["timings"]
|
||||
assert "predicted_per_second" in data["timings"]
|
||||
assert "predicted_n" in data["timings"]
|
||||
assert data["timings"]["predicted_n"] <= 10
|
||||
|
||||
|
||||
def test_logprobs():
|
||||
@@ -295,17 +307,23 @@ def test_logprobs_stream():
|
||||
)
|
||||
output_text = ''
|
||||
aggregated_text = ''
|
||||
for data in res:
|
||||
for i, data in enumerate(res):
|
||||
choice = data.choices[0]
|
||||
if choice.finish_reason is None:
|
||||
if choice.delta.content:
|
||||
output_text += choice.delta.content
|
||||
assert choice.logprobs is not None
|
||||
assert choice.logprobs.content is not None
|
||||
for token in choice.logprobs.content:
|
||||
aggregated_text += token.token
|
||||
assert token.logprob <= 0.0
|
||||
assert token.bytes is not None
|
||||
assert token.top_logprobs is not None
|
||||
assert len(token.top_logprobs) > 0
|
||||
if i == 0:
|
||||
# Check first role message for stream=True
|
||||
assert choice.delta.content == ""
|
||||
assert choice.delta.role == "assistant"
|
||||
else:
|
||||
assert choice.delta.role is None
|
||||
if choice.finish_reason is None:
|
||||
if choice.delta.content:
|
||||
output_text += choice.delta.content
|
||||
assert choice.logprobs is not None
|
||||
assert choice.logprobs.content is not None
|
||||
for token in choice.logprobs.content:
|
||||
aggregated_text += token.token
|
||||
assert token.logprob <= 0.0
|
||||
assert token.bytes is not None
|
||||
assert token.top_logprobs is not None
|
||||
assert len(token.top_logprobs) > 0
|
||||
assert aggregated_text == output_text
|
||||
|
||||
Reference in New Issue
Block a user