mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-30 01:27:42 +02:00
Compare commits
46 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 611aa914ef | |||
| 0cf6725e9f | |||
| 27ebfcacba | |||
| 5c86c9ed3e | |||
| efb8b47eda | |||
| 0527771dd8 | |||
| 2189fd3b63 | |||
| 3f96aeff39 | |||
| b486ba05bf | |||
| 02115dcd9a | |||
| d9c4accaff | |||
| 15e03282bb | |||
| f05a6d71a0 | |||
| ee01d71e58 | |||
| 8c83449cb7 | |||
| 1a844be132 | |||
| 0ccc121354 | |||
| 6562e5a4d6 | |||
| 51fb96b1ff | |||
| 70a6991edf | |||
| f061021206 | |||
| 8733e0cf6e | |||
| 814f795e06 | |||
| d879433824 | |||
| 13b0a04597 | |||
| bba9d945c1 | |||
| bc4e1128f7 | |||
| 39e73ae0d6 | |||
| 1f73301b63 | |||
| 4773d7a02f | |||
| 6c7fd67b64 | |||
| 141a908a59 | |||
| 32916a4907 | |||
| ffc727203a | |||
| 91a86a6f35 | |||
| f4ed10b69c | |||
| 1e333d5bba | |||
| 2f54e348ad | |||
| 2356fb1d53 | |||
| 764b85627b | |||
| 15a28ec8c7 | |||
| a7366faa5b | |||
| 9070365020 | |||
| 233461f812 | |||
| b34c859146 | |||
| 9b61acf060 |
@@ -0,0 +1,22 @@
|
||||
name: "Determine tag name"
|
||||
description: "Determine the tag name to use for a release"
|
||||
outputs:
|
||||
name:
|
||||
description: "The name of the tag"
|
||||
value: ${{ steps.tag.outputs.name }}
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
@@ -0,0 +1,67 @@
|
||||
name: "Windows - Setup CUDA Toolkit"
|
||||
description: "Setup CUDA Toolkit for Windows"
|
||||
inputs:
|
||||
cuda_version:
|
||||
description: "CUDA toolkit version"
|
||||
required: true
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Install Cuda Toolkit 11.7
|
||||
if: ${{ inputs.cuda_version == '11.7' }}
|
||||
shell: pwsh
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 12.4
|
||||
if: ${{ inputs.cuda_version == '12.4' }}
|
||||
shell: pwsh
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
+7
-713
@@ -2,30 +2,19 @@ name: CI
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
create_release:
|
||||
description: 'Create new release'
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
# Fine-grant permission
|
||||
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
||||
permissions:
|
||||
contents: write # for creating release
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
@@ -40,8 +29,6 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
@@ -74,33 +61,6 @@ jobs:
|
||||
cd build
|
||||
ctest -L 'main|curl' --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
|
||||
name: llama-bin-macos-arm64.zip
|
||||
|
||||
macOS-latest-cmake-x64:
|
||||
runs-on: macos-13
|
||||
|
||||
@@ -108,8 +68,6 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
@@ -143,33 +101,6 @@ jobs:
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
|
||||
name: llama-bin-macos-x64.zip
|
||||
|
||||
ubuntu-cpu-cmake:
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -185,8 +116,6 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
@@ -225,33 +154,6 @@ jobs:
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
|
||||
name: llama-bin-ubuntu-${{ matrix.build }}.zip
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -378,8 +280,6 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
@@ -409,33 +309,6 @@ jobs:
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
ctest -L main --verbose --timeout 2700
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
|
||||
name: llama-bin-ubuntu-vulkan-x64.zip
|
||||
|
||||
ubuntu-22-cmake-hip:
|
||||
runs-on: ubuntu-22.04
|
||||
container: rocm/dev-ubuntu-22.04:6.0.2
|
||||
@@ -831,8 +704,6 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
@@ -935,35 +806,6 @@ jobs:
|
||||
# $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
|
||||
# & $sde -future -- ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
ubuntu-latest-cmake-cuda:
|
||||
runs-on: ubuntu-latest
|
||||
container: nvidia/cuda:12.6.2-devel-ubuntu24.04
|
||||
@@ -972,8 +814,6 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
env:
|
||||
@@ -1005,77 +845,23 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4', '11.7']
|
||||
build: ['cuda']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
|
||||
key: windows-cuda-${{ matrix.cuda }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install Cuda Toolkit 11.7
|
||||
if: ${{ matrix.cuda == '11.7' }}
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 12.4
|
||||
if: ${{ matrix.cuda == '12.4' }}
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
- name: Install Cuda Toolkit
|
||||
uses: ./.github/actions/windows-setup-cuda
|
||||
with:
|
||||
cuda_version: ${{ matrix.cuda }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
@@ -1105,51 +891,6 @@ jobs:
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
run: |
|
||||
echo "Cuda install location: ${{ env.CUDA_PATH }}"
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-latest-cmake-sycl:
|
||||
runs-on: windows-latest
|
||||
|
||||
@@ -1165,8 +906,6 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
@@ -1185,52 +924,6 @@ jobs:
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
windows-latest-cmake-hip:
|
||||
if: ${{ github.event.inputs.create_release != 'true' }}
|
||||
runs-on: windows-latest
|
||||
@@ -1288,110 +981,12 @@ jobs:
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
# TODO: reuse windows-latest-cmake-hip instead of duplicating this job
|
||||
windows-latest-cmake-hip-release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
gpu_target: [gfx1100, gfx1101, gfx1030]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Clone rocWMMA repository
|
||||
id: clone_rocwmma
|
||||
run: |
|
||||
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-hip-release
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
id: depends
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
|
||||
write-host "Completed AMD HIP SDK installation"
|
||||
|
||||
- name: Verify ROCm
|
||||
id: verify
|
||||
run: |
|
||||
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_HIP=ON `
|
||||
-DGGML_RPC=ON `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
md "build\bin\rocblas\library\"
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -1418,32 +1013,6 @@ jobs:
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework
|
||||
|
||||
android-build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -1471,283 +1040,8 @@ jobs:
|
||||
- name: Build
|
||||
run: |
|
||||
cd examples/llama.android
|
||||
|
||||
./gradlew build --no-daemon
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
needs:
|
||||
- ubuntu-cpu-cmake
|
||||
- ubuntu-22-cmake-vulkan
|
||||
- windows-latest-cmake
|
||||
- windows-2019-cmake-cuda
|
||||
- windows-latest-cmake-sycl
|
||||
- windows-latest-cmake-hip-release
|
||||
- macOS-latest-cmake-arm64
|
||||
- macOS-latest-cmake-x64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: release
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Download artifacts
|
||||
id: download-artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ./artifact
|
||||
|
||||
- name: Move artifacts
|
||||
id: move_artifacts
|
||||
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: ggml-org/action-create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
|
||||
- name: Upload release
|
||||
id: upload_release
|
||||
uses: actions/github-script@v3
|
||||
with:
|
||||
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||
script: |
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const release_id = '${{ steps.create_release.outputs.id }}';
|
||||
for (let file of await fs.readdirSync('./artifact/release')) {
|
||||
if (path.extname(file) === '.zip') {
|
||||
console.log('uploadReleaseAsset', file);
|
||||
await github.repos.uploadReleaseAsset({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
release_id: release_id,
|
||||
name: file,
|
||||
data: await fs.readFileSync(`./artifact/release/${file}`)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
# ubuntu-latest-gcc:
|
||||
# runs-on: ubuntu-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [Debug, Release]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
# sudo apt-get install cmake
|
||||
#
|
||||
# - name: Configure
|
||||
# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# make
|
||||
#
|
||||
# ubuntu-latest-clang:
|
||||
# runs-on: ubuntu-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [Debug, Release]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
# sudo apt-get install cmake
|
||||
#
|
||||
# - name: Configure
|
||||
# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# make
|
||||
#
|
||||
# ubuntu-latest-gcc-sanitized:
|
||||
# runs-on: ubuntu-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
# sudo apt-get install cmake
|
||||
#
|
||||
# - name: Configure
|
||||
# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# make
|
||||
#
|
||||
# windows:
|
||||
# runs-on: windows-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [Release]
|
||||
# arch: [Win32, x64]
|
||||
# include:
|
||||
# - arch: Win32
|
||||
# s2arc: x86
|
||||
# - arch: x64
|
||||
# s2arc: x64
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Add msbuild to PATH
|
||||
# uses: microsoft/setup-msbuild@v1
|
||||
#
|
||||
# - name: Configure
|
||||
# run: >
|
||||
# cmake -S . -B ./build -A ${{ matrix.arch }}
|
||||
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cd ./build
|
||||
# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
|
||||
#
|
||||
# - name: Upload binaries
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: llama-bin-${{ matrix.arch }}
|
||||
# path: build/bin/${{ matrix.build }}
|
||||
#
|
||||
# windows-blas:
|
||||
# runs-on: windows-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [Release]
|
||||
# arch: [Win32, x64]
|
||||
# blas: [ON]
|
||||
# include:
|
||||
# - arch: Win32
|
||||
# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
|
||||
# s2arc: x86
|
||||
# - arch: x64
|
||||
# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
|
||||
# s2arc: x64
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Add msbuild to PATH
|
||||
# uses: microsoft/setup-msbuild@v1
|
||||
#
|
||||
# - name: Fetch OpenBLAS
|
||||
# if: matrix.blas == 'ON'
|
||||
# run: |
|
||||
# C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
|
||||
# 7z x blas.zip -oblas -y
|
||||
# copy blas/include/cblas.h .
|
||||
# copy blas/include/openblas_config.h .
|
||||
# echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
|
||||
#
|
||||
# - name: Configure
|
||||
# run: >
|
||||
# cmake -S . -B ./build -A ${{ matrix.arch }}
|
||||
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
|
||||
# -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cd ./build
|
||||
# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
|
||||
#
|
||||
# - name: Copy libopenblas.dll
|
||||
# if: matrix.blas == 'ON'
|
||||
# run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
|
||||
#
|
||||
# - name: Upload binaries
|
||||
# if: matrix.blas == 'ON'
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: llama-blas-bin-${{ matrix.arch }}
|
||||
# path: build/bin/${{ matrix.build }}
|
||||
#
|
||||
# emscripten:
|
||||
# runs-on: ubuntu-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [Release]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
# wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
|
||||
# tar -xvf master.tar.gz
|
||||
# emsdk-master/emsdk update
|
||||
# emsdk-master/emsdk install latest
|
||||
# emsdk-master/emsdk activate latest
|
||||
#
|
||||
# - name: Configure
|
||||
# run: echo "tmp"
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# pushd emsdk-master
|
||||
# source ./emsdk_env.sh
|
||||
# popd
|
||||
# emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
# make
|
||||
|
||||
openEuler-latest-cmake-cann:
|
||||
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
|
||||
defaults:
|
||||
|
||||
@@ -36,10 +36,14 @@ jobs:
|
||||
matrix:
|
||||
config:
|
||||
# Multi-stage build
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
# Note: the arm64 images are failing, which prevents the amd64 images from being built
|
||||
# https://github.com/ggml-org/llama.cpp/issues/11888
|
||||
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
# Note: the intel images are failing due to an out of disk space error
|
||||
# - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
||||
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
|
||||
|
||||
@@ -0,0 +1,709 @@
|
||||
name: Create Release
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
create_release:
|
||||
description: 'Create new release'
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
|
||||
|
||||
jobs:
|
||||
macOS-arm64:
|
||||
runs-on: macos-14
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-arm64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
brew install curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DGGML_RPC=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
|
||||
name: llama-bin-macos-arm64.zip
|
||||
|
||||
macOS-x64:
|
||||
runs-on: macos-13
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-x64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
brew install curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
|
||||
name: llama-bin-macos-x64.zip
|
||||
|
||||
ubuntu-22-cpu:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64'
|
||||
os: ubuntu-22.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-22.04-arm
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-cpu-cmake
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
|
||||
name: llama-bin-ubuntu-${{ matrix.build }}.zip
|
||||
|
||||
ubuntu-22-vulkan:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-22-cmake-vulkan
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
|
||||
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_VULKAN=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
|
||||
name: llama-bin-ubuntu-vulkan-x64.zip
|
||||
|
||||
windows:
|
||||
runs-on: windows-latest
|
||||
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
VULKAN_VERSION: 1.4.309.0
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'cpu-x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
|
||||
#- build: 'openblas-x64'
|
||||
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'vulkan-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
- build: 'cpu-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
|
||||
- build: 'opencl-adreno-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-${{ matrix.build }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
mkdir $env:RUNNER_TEMP/openblas
|
||||
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
|
||||
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
||||
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
||||
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
|
||||
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'vulkan-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Install OpenCL Headers and Libs
|
||||
id: install_opencl
|
||||
if: ${{ matrix.build == 'opencl-adreno-arm64' }}
|
||||
run: |
|
||||
git clone https://github.com/KhronosGroup/OpenCL-Headers
|
||||
cd OpenCL-Headers
|
||||
cmake -B build `
|
||||
-DBUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build --target install
|
||||
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
|
||||
cd OpenCL-ICD-Loader
|
||||
cmake -B build-arm64-release `
|
||||
-A arm64 `
|
||||
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build-arm64-release --target install --config release
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cmake -S . -B build ${{ matrix.defines }} `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
windows-cuda:
|
||||
runs-on: windows-2019
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4', '11.7']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-cuda-${{ matrix.cuda }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install Cuda Toolkit
|
||||
uses: ./.github/actions/windows-setup-cuda
|
||||
with:
|
||||
cuda_version: ${{ matrix.cuda }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
-DGGML_CPU_ALL_VARIANTS=ON ^
|
||||
-DGGML_CUDA=ON ^
|
||||
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
|
||||
${{ env.CMAKE_ARGS }}
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
run: |
|
||||
echo "Cuda install location: ${{ env.CUDA_PATH }}"
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-sycl:
|
||||
runs-on: windows-latest
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
# TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
windows-hip:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
gpu_target: [gfx1100, gfx1101, gfx1030]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Clone rocWMMA repository
|
||||
id: clone_rocwmma
|
||||
run: |
|
||||
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-hip-release
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
id: depends
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
|
||||
write-host "Completed AMD HIP SDK installation"
|
||||
|
||||
- name: Verify ROCm
|
||||
id: verify
|
||||
run: |
|
||||
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_HIP=ON `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
md "build\bin\rocblas\library\"
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
./build-xcframework.sh
|
||||
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
# Fine-grant permission
|
||||
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
||||
permissions:
|
||||
contents: write # for creating release
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
needs:
|
||||
- ubuntu-22-cpu
|
||||
- ubuntu-22-vulkan
|
||||
- windows
|
||||
- windows-cuda
|
||||
- windows-sycl
|
||||
- windows-hip
|
||||
- macOS-arm64
|
||||
- macOS-x64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Download artifacts
|
||||
id: download-artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ./artifact
|
||||
|
||||
- name: Move artifacts
|
||||
id: move_artifacts
|
||||
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: ggml-org/action-create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
|
||||
- name: Upload release
|
||||
id: upload_release
|
||||
uses: actions/github-script@v3
|
||||
with:
|
||||
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||
script: |
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const release_id = '${{ steps.create_release.outputs.id }}';
|
||||
for (let file of await fs.readdirSync('./artifact/release')) {
|
||||
if (path.extname(file) === '.zip') {
|
||||
console.log('uploadReleaseAsset', file);
|
||||
await github.repos.uploadReleaseAsset({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
release_id: release_id,
|
||||
name: file,
|
||||
data: await fs.readFileSync(`./artifact/release/${file}`)
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -252,20 +252,3 @@ configure_file(cmake/llama.pc.in
|
||||
|
||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
||||
|
||||
#
|
||||
# copy the license files
|
||||
#
|
||||
|
||||
# Check if running in GitHub Actions
|
||||
if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
|
||||
message(STATUS "Running inside GitHub Actions - copying license files")
|
||||
|
||||
# Copy all files from licenses/ to build/bin/
|
||||
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
|
||||
foreach(LICENSE_FILE ${LICENSE_FILES})
|
||||
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
|
||||
configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
|
||||
@@ -38,15 +38,6 @@
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-windows-msvc", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-windows-llvm", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
@@ -73,10 +64,6 @@
|
||||
{ "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
|
||||
{ "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
|
||||
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
|
||||
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
|
||||
|
||||
@@ -1187,11 +1187,6 @@ llama-cli: tools/main/main.cpp \
|
||||
@echo '==== Run ./llama-cli -h for help. ===='
|
||||
@echo
|
||||
|
||||
llama-infill: examples/infill/infill.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-run: tools/run/run.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
@@ -1394,36 +1389,36 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
libllava.a: tools/llava/llava.cpp \
|
||||
tools/llava/llava.h \
|
||||
tools/llava/clip.cpp \
|
||||
tools/llava/clip.h \
|
||||
libllava.a: tools/mtmd/llava.cpp \
|
||||
tools/mtmd/llava.h \
|
||||
tools/mtmd/clip.cpp \
|
||||
tools/mtmd/clip.h \
|
||||
common/stb_image.h \
|
||||
common/base64.hpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
|
||||
|
||||
llama-llava-cli: tools/llava/llava-cli.cpp \
|
||||
tools/llava/llava.cpp \
|
||||
tools/llava/llava.h \
|
||||
tools/llava/clip.cpp \
|
||||
tools/llava/clip.h \
|
||||
llama-llava-cli: tools/mtmd/llava-cli.cpp \
|
||||
tools/mtmd/llava.cpp \
|
||||
tools/mtmd/llava.h \
|
||||
tools/mtmd/clip.cpp \
|
||||
tools/mtmd/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
llama-minicpmv-cli: tools/llava/minicpmv-cli.cpp \
|
||||
tools/llava/llava.cpp \
|
||||
tools/llava/llava.h \
|
||||
tools/llava/clip.cpp \
|
||||
tools/llava/clip.h \
|
||||
llama-minicpmv-cli: tools/mtmd/minicpmv-cli.cpp \
|
||||
tools/mtmd/llava.cpp \
|
||||
tools/mtmd/llava.h \
|
||||
tools/mtmd/clip.cpp \
|
||||
tools/mtmd/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
llama-qwen2vl-cli: tools/llava/qwen2vl-cli.cpp \
|
||||
tools/llava/llava.cpp \
|
||||
tools/llava/llava.h \
|
||||
tools/llava/clip.cpp \
|
||||
tools/llava/clip.h \
|
||||
llama-qwen2vl-cli: tools/mtmd/qwen2vl-cli.cpp \
|
||||
tools/mtmd/llava.cpp \
|
||||
tools/mtmd/llava.h \
|
||||
tools/mtmd/clip.cpp \
|
||||
tools/mtmd/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
|
||||
@@ -144,3 +144,27 @@ endif ()
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
||||
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
||||
|
||||
|
||||
#
|
||||
# copy the license files
|
||||
#
|
||||
|
||||
# Check if running in GitHub Actions
|
||||
if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
|
||||
message(STATUS "Running inside GitHub Actions - copying license files")
|
||||
|
||||
# Copy all files from licenses/ to build/bin/
|
||||
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
|
||||
foreach(LICENSE_FILE ${LICENSE_FILES})
|
||||
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
|
||||
add_custom_command(
|
||||
POST_BUILD
|
||||
TARGET ${TARGET}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
"${LICENSE_FILE}"
|
||||
"$<TARGET_FILE_DIR:llama>/${FILENAME}"
|
||||
COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
|
||||
message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
+15
-15
@@ -1283,7 +1283,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.use_color = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||
add_opt(common_arg(
|
||||
{"-t", "--threads"}, "N",
|
||||
string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
||||
@@ -1416,7 +1416,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
add_opt(common_arg(
|
||||
{"-n", "--predict", "--n-predict"}, "N",
|
||||
string_format(
|
||||
ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
|
||||
ex == LLAMA_EXAMPLE_MAIN
|
||||
? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
|
||||
: "number of tokens to predict (default: %d, -1 = infinity)",
|
||||
params.n_predict),
|
||||
@@ -1655,7 +1655,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.input_prefix = value;
|
||||
params.enable_chat_template = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"--in-suffix"}, "STRING",
|
||||
"string to suffix after user inputs with (default: empty)",
|
||||
@@ -1663,7 +1663,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.input_suffix = value;
|
||||
params.enable_chat_template = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"--no-warmup"},
|
||||
"skip warming up the model with an empty run",
|
||||
@@ -1680,7 +1680,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.spm_infill = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--samplers"}, "SAMPLERS",
|
||||
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
||||
@@ -2097,13 +2097,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.cache_type_v = kv_cache_type_from_str(value);
|
||||
}
|
||||
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
|
||||
add_opt(common_arg(
|
||||
{"--perplexity", "--all-logits"},
|
||||
string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.logits_all = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
||||
add_opt(common_arg(
|
||||
{"--hellaswag"},
|
||||
"compute HellaSwag score over random tasks from datafile supplied with -f",
|
||||
@@ -2211,14 +2204,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj"}, "FILE",
|
||||
"path to a multimodal projector file. see tools/llava/README.md",
|
||||
"path to a multimodal projector file. see tools/mtmd/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.path = value;
|
||||
}
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj-url"}, "URL",
|
||||
"URL to a multimodal projector file. see tools/llava/README.md",
|
||||
"URL to a multimodal projector file. see tools/mtmd/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.url = value;
|
||||
}
|
||||
@@ -2634,6 +2627,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.i_chunk = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
{"--parse-special"},
|
||||
string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.parse_special = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
{"-pps"},
|
||||
string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
|
||||
@@ -2892,7 +2892,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.simple_io = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"--positive-file"}, "FNAME",
|
||||
string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
|
||||
|
||||
+3
-1
@@ -125,7 +125,9 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||
msgs.push_back(msg);
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
throw std::runtime_error("Failed to parse messages: " + std::string(e.what()) + "; messages = " + messages.dump(2));
|
||||
// @ngxson : disable otherwise it's bloating the API response
|
||||
// printf("%s\n", std::string("; messages = ") + messages.dump(2));
|
||||
throw std::runtime_error("Failed to parse messages: " + std::string(e.what()));
|
||||
}
|
||||
|
||||
return msgs;
|
||||
|
||||
@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||
cparams.n_threads = params.cpuparams.n_threads;
|
||||
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
||||
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
||||
cparams.logits_all = params.logits_all;
|
||||
cparams.embeddings = params.embedding;
|
||||
cparams.rope_scaling_type = params.rope_scaling_type;
|
||||
cparams.rope_freq_base = params.rope_freq_base;
|
||||
|
||||
+4
-3
@@ -66,7 +66,6 @@ enum llama_example {
|
||||
LLAMA_EXAMPLE_COMMON,
|
||||
LLAMA_EXAMPLE_SPECULATIVE,
|
||||
LLAMA_EXAMPLE_MAIN,
|
||||
LLAMA_EXAMPLE_INFILL,
|
||||
LLAMA_EXAMPLE_EMBEDDING,
|
||||
LLAMA_EXAMPLE_PERPLEXITY,
|
||||
LLAMA_EXAMPLE_RETRIEVAL,
|
||||
@@ -96,6 +95,7 @@ enum common_sampler_type {
|
||||
COMMON_SAMPLER_TYPE_XTC = 8,
|
||||
COMMON_SAMPLER_TYPE_INFILL = 9,
|
||||
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
||||
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
|
||||
};
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
@@ -161,6 +161,7 @@ struct common_params_sampling {
|
||||
std::vector<enum common_sampler_type> samplers = {
|
||||
COMMON_SAMPLER_TYPE_PENALTIES,
|
||||
COMMON_SAMPLER_TYPE_DRY,
|
||||
COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
|
||||
COMMON_SAMPLER_TYPE_TOP_K,
|
||||
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
||||
COMMON_SAMPLER_TYPE_TOP_P,
|
||||
@@ -323,7 +324,6 @@ struct common_params {
|
||||
bool ctx_shift = true; // context shift on inifinite text generation
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool logits_all = false; // return logits for all tokens in the batch
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
@@ -340,7 +340,7 @@ struct common_params {
|
||||
|
||||
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
|
||||
|
||||
// multimodal models (see tools/llava)
|
||||
// multimodal models (see tools/mtmd)
|
||||
struct common_params_model mmproj;
|
||||
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
||||
bool no_mmproj = false; // explicitly disable multimodal model
|
||||
@@ -409,6 +409,7 @@ struct common_params {
|
||||
|
||||
bool process_output = false; // collect data for the output tensor
|
||||
bool compute_ppl = true; // whether to compute perplexity
|
||||
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
|
||||
|
||||
// cvector-generator params
|
||||
int n_pca_batch = 100;
|
||||
|
||||
+57
-50
@@ -1,6 +1,7 @@
|
||||
#include "sampling.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
@@ -229,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
params.logit_bias.data()));
|
||||
|
||||
if (params.mirostat == 0) {
|
||||
if (params.top_n_sigma >= 0) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
||||
} else {
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
{
|
||||
std::vector<const char *> c_breakers;
|
||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||
for (const auto & str : params.dry_sequence_breakers) {
|
||||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
{
|
||||
std::vector<const char *> c_breakers;
|
||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||
for (const auto & str : params.dry_sequence_breakers) {
|
||||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
}
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
||||
@@ -475,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
||||
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
|
||||
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
||||
@@ -490,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
|
||||
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
|
||||
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
||||
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
|
||||
@@ -504,6 +504,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
{ "dry", COMMON_SAMPLER_TYPE_DRY },
|
||||
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
@@ -517,6 +518,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
|
||||
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
@@ -533,14 +535,16 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
auto sampler = sampler_canonical_name_map.find(name);
|
||||
if (sampler != sampler_canonical_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
} else {
|
||||
if (allow_alt_names) {
|
||||
sampler = sampler_alt_name_map.find(name);
|
||||
if (sampler != sampler_alt_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (allow_alt_names) {
|
||||
sampler = sampler_alt_name_map.find(name);
|
||||
if (sampler != sampler_alt_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
|
||||
}
|
||||
|
||||
return samplers;
|
||||
@@ -552,6 +556,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
|
||||
@@ -566,6 +571,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||
const auto sampler = sampler_name_map.find(c);
|
||||
if (sampler != sampler_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
} else {
|
||||
LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+73
-52
@@ -1388,10 +1388,10 @@ class BaichuanModel(TextModel):
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
head_count = self.hparams["num_attention_heads"]
|
||||
@@ -1512,10 +1512,10 @@ class XverseModel(TextModel):
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
@@ -1828,10 +1828,10 @@ class LlamaModel(TextModel):
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
@@ -2206,10 +2206,10 @@ class DeciModel(TextModel):
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
@@ -2449,10 +2449,10 @@ class MiniCPMModel(TextModel):
|
||||
logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
|
||||
self.gguf_writer.add_logit_scale(logit_scale)
|
||||
logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
|
||||
if self.hparams.get("rope_scaling") is not None:
|
||||
if self.hparams["rope_scaling"].get("type") == "longrope":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
|
||||
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
|
||||
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
@@ -2597,11 +2597,11 @@ class Qwen2Model(TextModel):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self._try_set_pooling_type()
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if self.hf_arch == "Qwen2Model":
|
||||
@@ -2761,6 +2761,13 @@ class Qwen2MoeModel(TextModel):
|
||||
if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
|
||||
self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
|
||||
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
|
||||
# YaRN is not enabled by default
|
||||
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
@@ -3028,7 +3035,7 @@ class Phi3MiniModel(TextModel):
|
||||
|
||||
scale = max_pos_embds / orig_max_pos_embds
|
||||
|
||||
rope_scaling_type = rope_scaling.get('type', '').lower()
|
||||
rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
|
||||
if len(rope_scaling_type) == 0:
|
||||
raise KeyError('Missing the required key rope_scaling.type')
|
||||
|
||||
@@ -3340,10 +3347,10 @@ class InternLM2Model(TextModel):
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
num_heads = self.hparams["num_attention_heads"]
|
||||
@@ -3418,10 +3425,10 @@ class InternLM3Model(TextModel):
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
@@ -3908,6 +3915,16 @@ class Gemma3VisionModel(VisionModel):
|
||||
# default values below are taken from HF tranformers code
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
# calculate proj_scale_factor (used by tinygemma3 test model)
|
||||
image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
|
||||
n_per_side = int(image_seq_length ** 0.5)
|
||||
image_size = self.hparams["image_size"]
|
||||
patch_size = self.hparams["patch_size"]
|
||||
proj_scale_factor = (image_size // patch_size) // n_per_side
|
||||
if proj_scale_factor > 0 and proj_scale_factor != 4:
|
||||
# we only need to write this if it's not the default value
|
||||
# in this case, we are converting a test model
|
||||
self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, new_name, n_dims # unused
|
||||
@@ -3921,6 +3938,9 @@ class Gemma3VisionModel(VisionModel):
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
if "vision_model.head." in name:
|
||||
return [] # skip redundant tensors for tinygemma3
|
||||
|
||||
if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
|
||||
or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
|
||||
# process vision tensors
|
||||
@@ -4846,12 +4866,12 @@ class DeepseekV2Model(TextModel):
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
@@ -5343,11 +5363,11 @@ class Glm4Model(TextModel):
|
||||
super().set_gguf_parameters()
|
||||
rope_dim = self.hparams["head_dim"]
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
|
||||
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
|
||||
@@ -5580,10 +5600,10 @@ class ExaoneModel(TextModel):
|
||||
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
|
||||
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
|
||||
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||
if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
|
||||
if hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||
@@ -5686,10 +5706,11 @@ class BailingMoeModel(TextModel):
|
||||
rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
if (self.hparams.get("rope_scaling") or {}).get("type") == "yarn" and "factor" in self.hparams["rope_scaling"]:
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
else:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
|
||||
|
||||
@@ -33,13 +33,13 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./tools/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
|
||||
python ./tools/mtmd/llava_surgery.py -m path/to/MobileVLM-1.7B
|
||||
```
|
||||
|
||||
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./tools/llava/convert_image_encoder_to_gguf.py \
|
||||
python ./tools/mtmd/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B \
|
||||
@@ -47,7 +47,7 @@ python ./tools/llava/convert_image_encoder_to_gguf.py \
|
||||
```
|
||||
|
||||
```sh
|
||||
python ./tools/llava/convert_image_encoder_to_gguf.py \
|
||||
python ./tools/mtmd/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B_V2 \
|
||||
@@ -69,10 +69,10 @@ Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directo
|
||||
|
||||
## Android compile and run
|
||||
### compile
|
||||
refer to `tools/llava/android/build_64.sh`
|
||||
refer to `tools/mtmd/android/build_64.sh`
|
||||
```sh
|
||||
mkdir tools/llava/android/build_64
|
||||
cd tools/llava/android/build_64
|
||||
mkdir tools/mtmd/android/build_64
|
||||
cd tools/mtmd/android/build_64
|
||||
../build_64.sh
|
||||
```
|
||||
### run on Android
|
||||
|
||||
@@ -25,13 +25,13 @@ git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/T
|
||||
2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./tools/llava/glmedge-surgery.py -m ../model_path
|
||||
python ./tools/mtmd/glmedge-surgery.py -m ../model_path
|
||||
```
|
||||
|
||||
4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./tools/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
|
||||
python ./tools/mtmd/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
|
||||
```
|
||||
|
||||
5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
|
||||
|
||||
@@ -37,19 +37,19 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
2. Install the required Python packages:
|
||||
|
||||
```sh
|
||||
pip install -r tools/llava/requirements.txt
|
||||
pip install -r tools/mtmd/requirements.txt
|
||||
```
|
||||
|
||||
3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./tools/llava/llava_surgery.py -m ../llava-v1.5-7b
|
||||
python ./tools/mtmd/llava_surgery.py -m ../llava-v1.5-7b
|
||||
```
|
||||
|
||||
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./tools/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
||||
python ./tools/mtmd/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
||||
```
|
||||
|
||||
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
@@ -69,12 +69,12 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
|
||||
2) Install the required Python packages:
|
||||
|
||||
```sh
|
||||
pip install -r tools/llava/requirements.txt
|
||||
pip install -r tools/mtmd/requirements.txt
|
||||
```
|
||||
|
||||
3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
|
||||
```console
|
||||
python tools/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
|
||||
python tools/mtmd/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
|
||||
```
|
||||
- you will find a llava.projector and a llava.clip file in your model directory
|
||||
|
||||
@@ -88,7 +88,7 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso
|
||||
|
||||
5) Create the visual gguf model:
|
||||
```console
|
||||
python ./tools/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
|
||||
python ./tools/mtmd/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
|
||||
```
|
||||
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
|
||||
|
||||
|
||||
@@ -29,8 +29,8 @@ cmake --build build --config Release
|
||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
|
||||
|
||||
```bash
|
||||
python ./tools/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
|
||||
python ./tools/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
|
||||
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
|
||||
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
|
||||
|
||||
# quantize int4 version
|
||||
|
||||
@@ -28,8 +28,8 @@ cmake --build build --config Release
|
||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
|
||||
|
||||
```bash
|
||||
python ./tools/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||
python ./tools/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
||||
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
||||
|
||||
# quantize int4 version
|
||||
|
||||
@@ -28,8 +28,8 @@ cmake --build build --config Release
|
||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
|
||||
|
||||
```bash
|
||||
python ./tools/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
|
||||
python ./tools/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
|
||||
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
|
||||
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
|
||||
|
||||
# quantize int4 version
|
||||
|
||||
@@ -21,7 +21,6 @@ else()
|
||||
add_subdirectory(gguf-hash)
|
||||
add_subdirectory(gguf)
|
||||
add_subdirectory(gritlm)
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(lookahead)
|
||||
add_subdirectory(lookup)
|
||||
add_subdirectory(parallel)
|
||||
|
||||
@@ -35,23 +35,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
|
||||
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
|
||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||
const struct llama_model * model = llama_get_model(ctx);
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
// run model
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
|
||||
// encoder-only model
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
}
|
||||
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
||||
// decoder-only model
|
||||
if (llama_decode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to decode\n", __func__);
|
||||
}
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
}
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-infill)
|
||||
add_executable(${TARGET} infill.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -1,47 +0,0 @@
|
||||
# llama.cpp/example/infill
|
||||
|
||||
This example shows how to use the infill mode with Code Llama models supporting infill mode.
|
||||
Currently the 7B and 13B models support infill mode.
|
||||
|
||||
Infill supports most of the options available in the main example.
|
||||
|
||||
For further information have a look at the main README.md in llama.cpp/example/main/README.md
|
||||
|
||||
## Common Options
|
||||
|
||||
In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
|
||||
|
||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
|
||||
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
|
||||
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
|
||||
- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
|
||||
|
||||
## Input Prompts
|
||||
|
||||
The `infill` program provides several ways to interact with the LLaMA models using input prompts:
|
||||
|
||||
- `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
|
||||
- `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
|
||||
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
|
||||
|
||||
## Interaction
|
||||
|
||||
The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive`, and `--interactive-first`
|
||||
|
||||
### Interaction Options
|
||||
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing users to get real time code suggestions from model.
|
||||
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
|
||||
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
|
||||
|
||||
### Example
|
||||
|
||||
Download a model that supports infill, for example CodeLlama:
|
||||
```console
|
||||
scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
|
||||
```
|
||||
|
||||
```bash
|
||||
./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
|
||||
```
|
||||
@@ -1,590 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "console.h"
|
||||
#include "sampling.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
static llama_model ** g_model;
|
||||
static common_sampler ** g_smpl;
|
||||
static common_params * g_params;
|
||||
static std::vector<llama_token> * g_input_tokens;
|
||||
static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
|
||||
static bool is_interacting = false;
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
static void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting) {
|
||||
is_interacting = true;
|
||||
} else {
|
||||
console::cleanup();
|
||||
LOG("\n");
|
||||
common_perf_print(*g_ctx, *g_smpl);
|
||||
|
||||
// make sure all logs are flushed
|
||||
LOG("Interrupted by user\n");
|
||||
common_log_pause(common_log_main());
|
||||
|
||||
_exit(130);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
g_params = ¶ms;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
auto & sparams = params.sampling;
|
||||
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
if (params.logits_all) {
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.embedding) {
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||
LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
|
||||
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 0.0) {
|
||||
LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 0.0) {
|
||||
LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
LOG_INF("%s: llama backend init\n", __func__);
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model * model = nullptr;
|
||||
llama_context * ctx = nullptr;
|
||||
common_sampler * smpl = nullptr;
|
||||
|
||||
g_model = &model;
|
||||
g_ctx = &ctx;
|
||||
g_smpl = &smpl;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
common_init_result llama_init = common_init_from_params(params);
|
||||
|
||||
model = llama_init.model.get();
|
||||
ctx = llama_init.context.get();
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
const int n_ctx_train = llama_model_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG_DBG("n_ctx: %d\n", n_ctx);
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
}
|
||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
std::vector<llama_token> embd_end;
|
||||
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
||||
|
||||
GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
|
||||
GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
|
||||
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
|
||||
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
const llama_token middle_token = llama_vocab_fim_mid(vocab);
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
LOG_DBG("add_bos: %d\n", add_bos);
|
||||
LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
|
||||
LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
|
||||
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// number of tokens to keep when resetting context
|
||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
}
|
||||
|
||||
LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
|
||||
LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
|
||||
|
||||
// enable interactive mode if interactive start is specified
|
||||
if (params.interactive_first) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
LOG_CNT("'\n");
|
||||
}
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
if (params.interactive) {
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = sigint_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
LOG_INF("%s: interactive mode on.\n", __func__);
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_INF("Input prefix with BOS\n");
|
||||
}
|
||||
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
smpl = common_sampler_init(model, sparams);
|
||||
|
||||
LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
|
||||
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
||||
LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
|
||||
|
||||
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
|
||||
LOG_INF("\n");
|
||||
LOG_INF("\n##### Infill mode #####\n\n");
|
||||
if (params.interactive) {
|
||||
const char *control_message;
|
||||
if (params.multiline_input) {
|
||||
control_message = " - To return control to LLaMA, end your input with '\\'.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n";
|
||||
} else {
|
||||
control_message = " - Press Return to return control to LLaMA.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n"
|
||||
" - If you want to submit another line, end your input with '\\'.\n";
|
||||
}
|
||||
LOG_INF("== Running in interactive mode. ==\n");
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
|
||||
#endif
|
||||
LOG_INF( "%s\n", control_message);
|
||||
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
|
||||
bool input_echo = true;
|
||||
|
||||
int n_past = 0;
|
||||
int n_remain = params.n_predict;
|
||||
int n_consumed = 0;
|
||||
|
||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||
std::ostringstream output_ss; g_output_ss = &output_ss;
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
console::set_display(console::prompt);
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
|
||||
// --prompt or --file which uses the same value.
|
||||
int max_embd_size = n_ctx - 4;
|
||||
|
||||
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
|
||||
if ((int) embd.size() > max_embd_size) {
|
||||
const int skipped_tokens = (int) embd.size() - max_embd_size;
|
||||
embd.resize(max_embd_size);
|
||||
|
||||
console::set_display(console::error);
|
||||
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
|
||||
// infinite text generation via context swapping
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() > n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep - 1;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
LOG_DBG("after swap: n_past = %d\n", n_past);
|
||||
|
||||
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
}
|
||||
|
||||
// evaluate tokens in batches
|
||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||
int n_eval = (int) embd.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past += n_eval;
|
||||
|
||||
LOG_DBG("n_past = %d\n", n_past);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
const llama_token id = common_sampler_sample(smpl, ctx, -1);
|
||||
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
|
||||
|
||||
embd.push_back(id);
|
||||
|
||||
// echo this to console
|
||||
input_echo = true;
|
||||
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
common_sampler_accept(smpl, embd_inp[n_consumed], false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// display text
|
||||
if (input_echo) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = common_token_to_piece(ctx, id);
|
||||
LOG("%s", token_str.c_str());
|
||||
|
||||
if (embd.size() > 1) {
|
||||
input_tokens.push_back(id);
|
||||
} else {
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
}
|
||||
}
|
||||
// reset color to default if we there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
// deal with eot token in infill mode
|
||||
if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
|
||||
if (is_interacting && !params.interactive_first) {
|
||||
// print an eot token
|
||||
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
|
||||
}
|
||||
LOG("\n");
|
||||
console::set_display(console::user_input);
|
||||
std::string buffer;
|
||||
std::string line;
|
||||
bool another_line=true;
|
||||
// set a new prefix via stdin
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line, if so we use the old input
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_prefix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
// set a new suffix via stdin
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_suffix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
|
||||
if (params.escape) {
|
||||
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
||||
string_process_escapes(params.input_prefix);
|
||||
string_process_escapes(params.input_suffix);
|
||||
}
|
||||
|
||||
// tokenize new prefix and suffix
|
||||
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
||||
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
|
||||
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
n_remain = params.n_predict;
|
||||
n_past = 0;
|
||||
n_consumed = 0;
|
||||
is_interacting = false;
|
||||
}
|
||||
// deal with end of generation tokens in interactive mode
|
||||
else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
|
||||
LOG_DBG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
||||
is_interacting = true;
|
||||
LOG("\n");
|
||||
console::set_display(console::user_input);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting && !params.interactive) {
|
||||
LOG_DBG("waiting for user input\n");
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_DBG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
buffer += params.input_prefix;
|
||||
LOG("%s", buffer.c_str());
|
||||
}
|
||||
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering a empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
buffer += params.input_suffix;
|
||||
LOG("%s", params.input_suffix.c_str());
|
||||
}
|
||||
|
||||
LOG_DBG("buffer: '%s'\n", buffer.c_str());
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
const auto line_inp = common_tokenize(ctx, buffer, false);
|
||||
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
||||
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
output_ss << common_token_to_piece(ctx, token);
|
||||
}
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
LOG_DBG("empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
}
|
||||
|
||||
if (n_past > 0) {
|
||||
if (is_interacting) {
|
||||
common_sampler_reset(smpl);
|
||||
}
|
||||
is_interacting = false;
|
||||
}
|
||||
}
|
||||
|
||||
// end of generation
|
||||
if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
|
||||
break;
|
||||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
|
||||
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
|
||||
n_remain = params.n_predict;
|
||||
is_interacting = true;
|
||||
}
|
||||
}
|
||||
if (!params.interactive && n_remain <= 0) {
|
||||
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
|
||||
}
|
||||
|
||||
LOG("\n");
|
||||
common_perf_print(ctx, smpl);
|
||||
|
||||
common_sampler_free(smpl);
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -366,6 +366,8 @@ if (MSVC)
|
||||
/wd4005 # Macro redefinition
|
||||
/wd4244 # Conversion from one type to another type, possible loss of data
|
||||
/wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
|
||||
/wd4996 # Disable POSIX deprecation warnings
|
||||
/wd4702 # Unreachable code warnings
|
||||
)
|
||||
function(disable_msvc_warnings target_name)
|
||||
if(TARGET ${target_name})
|
||||
|
||||
@@ -38,7 +38,7 @@ extern "C" {
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
|
||||
|
||||
@@ -59,7 +59,7 @@ extern "C" {
|
||||
GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
|
||||
@@ -673,11 +673,15 @@ extern "C" {
|
||||
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
||||
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
||||
|
||||
// returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
|
||||
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
|
||||
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
|
||||
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
|
||||
|
||||
// returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
|
||||
GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
|
||||
|
||||
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
|
||||
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
|
||||
|
||||
|
||||
@@ -214,7 +214,7 @@ add_library(ggml
|
||||
target_link_libraries(ggml PUBLIC ggml-base)
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
target_link_libraries(ggml PRIVATE dl stdc++fs)
|
||||
target_link_libraries(ggml PRIVATE dl)
|
||||
endif()
|
||||
|
||||
function(ggml_add_backend_library backend)
|
||||
|
||||
@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
return SIZE_MAX;
|
||||
}
|
||||
|
||||
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
||||
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
|
||||
// get_alloc_size is optional, defaults to ggml_nbytes
|
||||
if (buft->iface.get_alloc_size) {
|
||||
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
||||
@@ -152,7 +152,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
|
||||
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
|
||||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
|
||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
||||
}
|
||||
|
||||
|
||||
@@ -72,8 +72,6 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
#elif defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
@@ -20,12 +20,6 @@
|
||||
#define GROUP_MAX_EPS_IQ1_M 1e-7f
|
||||
#define GROUP_MAX_EPS_IQ1_S 1e-12f
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid warnings for hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
|
||||
|
||||
@@ -50,19 +50,6 @@
|
||||
#include "llamafile/sgemm.h"
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
|
||||
// disable POSIX deprecation warnings
|
||||
// these functions are never going away, anyway
|
||||
#pragma warning(disable: 4996)
|
||||
|
||||
// unreachable code because of multiple instances of code after GGML_ABORT
|
||||
#pragma warning(disable: 4702)
|
||||
#endif
|
||||
|
||||
// Note: once we move threading into a separate C++ file
|
||||
// will use std::hardware_destructive_interference_size instead of hardcoding it here
|
||||
// and we'll use C++ attribute syntax.
|
||||
|
||||
@@ -8,19 +8,6 @@
|
||||
|
||||
#include <float.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
|
||||
// disable POSIX deprecation warnings
|
||||
// these functions are never going away, anyway
|
||||
#pragma warning(disable: 4996)
|
||||
|
||||
// unreachable code because of multiple instances of code after GGML_ABORT
|
||||
#pragma warning(disable: 4702)
|
||||
#endif
|
||||
|
||||
// ggml_compute_forward_dup
|
||||
|
||||
static void ggml_compute_forward_dup_same_cont(
|
||||
|
||||
@@ -2,12 +2,6 @@
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
#endif
|
||||
|
||||
// precomputed gelu table for f16 (128 KB)
|
||||
ggml_fp16_t ggml_table_gelu_f16[1 << 16];
|
||||
|
||||
|
||||
@@ -12,12 +12,30 @@ if (CUDAToolkit_FOUND)
|
||||
# 61 == Pascal, __dp4a instruction (per-byte integer dot product)
|
||||
# 70 == V100, FP16 tensor cores
|
||||
# 75 == Turing, int8 tensor cores
|
||||
# 80 == Ampere, asynchronous data loading, faster tensor core instructions
|
||||
# 86 == RTX 3000, needs CUDA v11.1
|
||||
# 89 == RTX 4000, needs CUDA v11.8
|
||||
#
|
||||
# XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
|
||||
# XX-real == compile CUDA code as device code for this specific architecture
|
||||
# no suffix == compile as both PTX and device code
|
||||
#
|
||||
# The default behavior for a non-native is to build virtual architectures as needed to cover all features needed
|
||||
# for best performance and to also build real architectures for the most commonly used GPUs.
|
||||
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "native")
|
||||
elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75;80")
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
|
||||
endif()
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50;61;70;75;80")
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
@@ -100,7 +118,7 @@ if (CUDAToolkit_FOUND)
|
||||
|
||||
set(CUDA_CXX_FLAGS "")
|
||||
|
||||
set(CUDA_FLAGS -use_fast_math)
|
||||
set(CUDA_FLAGS -use_fast_math -extended-lambda)
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||
# Options are:
|
||||
|
||||
@@ -130,10 +130,6 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {
|
||||
|
||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#define GGML_CUDA_MAX_STREAMS 8
|
||||
|
||||
[[noreturn]]
|
||||
@@ -300,6 +296,25 @@ static __device__ void no_device_code(
|
||||
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
|
||||
#endif // __CUDA_ARCH__
|
||||
|
||||
// The compiler is always able to unroll loops if they contain continue expressions.
|
||||
// In such cases loop unrolling can still be achieved via recursion:
|
||||
template <int n>
|
||||
struct ggml_cuda_unroll {
|
||||
template <typename Func, typename... Args>
|
||||
__device__ void operator()(const Func & f, Args... args) const {
|
||||
f(n - 1, args...);
|
||||
ggml_cuda_unroll<n - 1>{}(f, args...);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct ggml_cuda_unroll<1> {
|
||||
template <typename Func, typename... Args>
|
||||
__device__ void operator()(const Func & f, Args... args) const {
|
||||
f(0, args...);
|
||||
}
|
||||
};
|
||||
|
||||
template<int width = WARP_SIZE>
|
||||
static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
|
||||
@@ -2,6 +2,17 @@
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
|
||||
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
|
||||
#ifdef CP_ASYNC_AVAILABLE
|
||||
return __cvta_generic_to_shared(generic_ptr);
|
||||
#else
|
||||
GGML_UNUSED(generic_ptr);
|
||||
NO_DEVICE_CODE;
|
||||
return 0;
|
||||
#endif // CP_ASYNC_AVAILABLE
|
||||
}
|
||||
|
||||
// Copies data from global to shared memory, cg == cache global.
|
||||
// Both the src and dst pointers must be aligned to 16 bit.
|
||||
// Shared memory uses 32 bit addressing, the pointer is passed as unsigned int.
|
||||
|
||||
@@ -516,7 +516,7 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
|
||||
nullptr;
|
||||
}
|
||||
|
||||
template<int D, int ncols1, int ncols2, int KQ_stride> // D == head size
|
||||
template<int D, int ncols1, int ncols2> // D == head size
|
||||
__launch_bounds__(D, 1)
|
||||
static __global__ void flash_attn_stream_k_fixup(
|
||||
float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne11) {
|
||||
@@ -665,13 +665,13 @@ static void on_no_fattn_vec_case(const int D) {
|
||||
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
|
||||
GGML_ABORT("fatal error");
|
||||
} else {
|
||||
fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
|
||||
fprintf(stderr, "Unsupported KV type combination for head_size %d.\n", D);
|
||||
fprintf(stderr, "Only f16 is supported.\n");
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
template <int D, int ncols1, int ncols2, int KQ_stride>
|
||||
template <int DV, int ncols1, int ncols2>
|
||||
void launch_fattn(
|
||||
ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, const int nwarps, const size_t nbytes_shared,
|
||||
const int KQ_row_granularity, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE
|
||||
@@ -691,7 +691,7 @@ void launch_fattn(
|
||||
|
||||
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
|
||||
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
|
||||
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
|
||||
|
||||
GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
|
||||
|
||||
@@ -719,6 +719,7 @@ void launch_fattn(
|
||||
size_t nb23 = V->nb[3];
|
||||
|
||||
if (need_f16_K && K->type != GGML_TYPE_F16) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(K));
|
||||
K_f16.alloc(ggml_nelements(K));
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
|
||||
to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
|
||||
@@ -733,6 +734,7 @@ void launch_fattn(
|
||||
}
|
||||
|
||||
if (need_f16_V && V->type != GGML_TYPE_F16) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(V));
|
||||
V_f16.alloc(ggml_nelements(V));
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
|
||||
to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
|
||||
@@ -752,10 +754,13 @@ void launch_fattn(
|
||||
const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3];
|
||||
|
||||
const dim3 block_dim(warp_size, nwarps, 1);
|
||||
int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
|
||||
CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
|
||||
|
||||
dim3 blocks_num;
|
||||
if (stream_k) {
|
||||
// For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
|
||||
const int max_blocks = 2*nsm;
|
||||
const int max_blocks = max_blocks_per_sm*nsm;
|
||||
const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
|
||||
const int tiles_efficiency_percent = 100 * ntiles_total / (max_blocks*tiles_nwaves);
|
||||
|
||||
@@ -767,14 +772,11 @@ void launch_fattn(
|
||||
blocks_num.y = 1;
|
||||
blocks_num.z = 1;
|
||||
|
||||
dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + D) * sizeof(float));
|
||||
dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
|
||||
} else {
|
||||
GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0);
|
||||
const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
|
||||
|
||||
int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
|
||||
CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
|
||||
|
||||
// parallel_blocks should be at least large enough to achieve max. occupancy for a single wave:
|
||||
parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1);
|
||||
|
||||
@@ -851,19 +853,19 @@ void launch_fattn(
|
||||
|
||||
if (stream_k) {
|
||||
if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 block_dim_combine(DV, 1, 1);
|
||||
const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};
|
||||
|
||||
flash_attn_stream_k_fixup<D, ncols1, ncols2, KQ_stride>
|
||||
flash_attn_stream_k_fixup<DV, ncols1, ncols2>
|
||||
<<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
|
||||
((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[1]);
|
||||
}
|
||||
} else if (parallel_blocks > 1) {
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 block_dim_combine(DV, 1, 1);
|
||||
const dim3 blocks_num_combine(Q->ne[1], 1, blocks_num.z);
|
||||
const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2);
|
||||
|
||||
flash_attn_combine_results<D>
|
||||
flash_attn_combine_results<DV>
|
||||
<<<blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream>>>
|
||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -307,7 +307,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1, -1>
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
|
||||
} break;
|
||||
case 128: {
|
||||
@@ -315,7 +315,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1, -1>
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
|
||||
} break;
|
||||
default: {
|
||||
|
||||
@@ -318,7 +318,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1, -1>
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
|
||||
} break;
|
||||
case 128: {
|
||||
@@ -326,7 +326,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1, -1>
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
|
||||
} break;
|
||||
default: {
|
||||
|
||||
@@ -315,7 +315,7 @@ void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx,
|
||||
constexpr bool need_f16_K = D != 128;
|
||||
constexpr bool need_f16_V = D != 128 && D != 64;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
|
||||
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
|
||||
}
|
||||
|
||||
template <int D, ggml_type type_K, ggml_type type_V>
|
||||
|
||||
@@ -310,7 +310,7 @@ void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx,
|
||||
constexpr bool need_f16_K = D != 128;
|
||||
constexpr bool need_f16_V = D != 128 && D != 64;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
|
||||
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
|
||||
}
|
||||
|
||||
template <int D, ggml_type type_K, ggml_type type_V>
|
||||
|
||||
@@ -490,7 +490,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
|
||||
fattn_kernel = flash_attn_ext_f16<
|
||||
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), KQ_acc_t, use_logit_softcap>;
|
||||
}
|
||||
launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size);
|
||||
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size);
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
+71
-45
@@ -8,58 +8,32 @@
|
||||
#include "fattn-wmma-f16.cuh"
|
||||
#include "fattn.cuh"
|
||||
|
||||
template <int D, int ncols2>
|
||||
template <int DKQ, int DV, int ncols2>
|
||||
static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
if (Q->ne[1] <= 8/ncols2) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<D, 8/ncols2, ncols2>(ctx, dst);
|
||||
return;
|
||||
if constexpr (ncols2 <= 8) {
|
||||
if (Q->ne[1] <= 8/ncols2) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 8/ncols2, ncols2>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 16/ncols2) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<D, 16/ncols2, ncols2>(ctx, dst);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 32/ncols2) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<D, 32/ncols2, ncols2>(ctx, dst);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 32/ncols2, ncols2>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<D, 64/ncols2, ncols2>(ctx, dst);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 64/ncols2, ncols2>(ctx, dst);
|
||||
}
|
||||
|
||||
template <int ncols2>
|
||||
static void ggml_cuda_flash_attn_ext_mma_f16_switch_hs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 64, ncols2>(ctx, dst);
|
||||
break;
|
||||
case 80:
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 80, ncols2>(ctx, dst);
|
||||
break;
|
||||
case 96:
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 96, ncols2>(ctx, dst);
|
||||
break;
|
||||
case 112:
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<112, ncols2>(ctx, dst);
|
||||
break;
|
||||
case 128:
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<128, ncols2>(ctx, dst);
|
||||
break;
|
||||
case 256:
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<256, ncols2>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
template <int DKQ, int DV>
|
||||
static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
@@ -68,27 +42,79 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
|
||||
float max_bias = 0.0f;
|
||||
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const float use_gqa_opt = mask && max_bias == 0.0f;
|
||||
const bool use_gqa_opt = mask && max_bias == 0.0f;
|
||||
|
||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
|
||||
if (use_gqa_opt && gqa_ratio % 8 == 0) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_hs<8>(ctx, dst);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 8>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (use_gqa_opt && gqa_ratio == 4) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_hs<4>(ctx, dst);
|
||||
if (use_gqa_opt && gqa_ratio % 4 == 0) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 4>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (use_gqa_opt && gqa_ratio == 2) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_hs<2>(ctx, dst);
|
||||
if (use_gqa_opt && gqa_ratio % 2 == 0) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_hs<1>(ctx, dst);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
|
||||
}
|
||||
|
||||
static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
GGML_ASSERT(V->ne[0] == 64);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 64, 64>(ctx, dst);
|
||||
break;
|
||||
case 80:
|
||||
GGML_ASSERT(V->ne[0] == 80);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 80, 80>(ctx, dst);
|
||||
break;
|
||||
case 96:
|
||||
GGML_ASSERT(V->ne[0] == 96);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 96, 96>(ctx, dst);
|
||||
break;
|
||||
case 112:
|
||||
GGML_ASSERT(V->ne[0] == 112);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<112, 112>(ctx, dst);
|
||||
break;
|
||||
case 128:
|
||||
GGML_ASSERT(V->ne[0] == 128);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<128, 128>(ctx, dst);
|
||||
break;
|
||||
case 256:
|
||||
GGML_ASSERT(V->ne[0] == 256);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
|
||||
break;
|
||||
case 576: {
|
||||
// For Deepseek, go straight to the ncols1 switch to avoid compiling unnecessary kernels.
|
||||
GGML_ASSERT(V->ne[0] == 512);
|
||||
float max_bias = 0.0f;
|
||||
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const bool use_gqa_opt = mask && max_bias == 0.0f;
|
||||
GGML_ASSERT(use_gqa_opt);
|
||||
|
||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
GGML_ASSERT(gqa_ratio % 16 == 0);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#define FATTN_VEC_F16_CASE(D, type_K, type_V) \
|
||||
@@ -299,7 +325,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
|
||||
const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
|
||||
const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion;
|
||||
const bool can_use_vector_kernel = Q->ne[0] % (2*warp_size) == 0;
|
||||
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
|
||||
if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
|
||||
if (prec == GGML_PREC_DEFAULT) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
|
||||
@@ -10,10 +10,11 @@ static __global__ void k_get_rows(
|
||||
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
|
||||
|
||||
const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
|
||||
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
|
||||
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
|
||||
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
|
||||
const int i00 = (blockIdx.y * blockDim.x + threadIdx.x)*2;
|
||||
const int i10 = blockIdx.x;
|
||||
const int i11 = blockIdx.z / ne12;
|
||||
const int i12 = blockIdx.z % ne12;
|
||||
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
@@ -46,10 +47,11 @@ static __global__ void k_get_rows_float(
|
||||
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
|
||||
|
||||
const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
|
||||
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
|
||||
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
|
||||
const int i00 = blockIdx.y * blockDim.x + threadIdx.x;
|
||||
const int i10 = blockIdx.x;
|
||||
const int i11 = blockIdx.z / ne12;
|
||||
const int i12 = blockIdx.z % ne12;
|
||||
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
@@ -94,8 +96,8 @@ static void get_rows_cuda_q(
|
||||
const size_t nb1, const size_t nb2, const size_t nb3,
|
||||
cudaStream_t stream) {
|
||||
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
||||
const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
|
||||
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
|
||||
const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
|
||||
const dim3 block_nums(ne10, block_num_y, ne11*ne12);
|
||||
|
||||
// strides in elements
|
||||
// const size_t s0 = nb0 / sizeof(dst_t);
|
||||
@@ -127,8 +129,8 @@ static void get_rows_cuda_float(
|
||||
const size_t nb1, const size_t nb2, const size_t nb3,
|
||||
cudaStream_t stream) {
|
||||
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
||||
const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
|
||||
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
|
||||
const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
|
||||
const dim3 block_nums(ne10, block_num_y, ne11*ne12);
|
||||
|
||||
// strides in elements
|
||||
// const size_t s0 = nb0 / sizeof(dst_t);
|
||||
|
||||
@@ -555,8 +555,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
|
||||
|
||||
if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
// initialize padding to 0 to avoid possible NaN values
|
||||
size_t original_size = ggml_nbytes(tensor);
|
||||
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
||||
const size_t original_size = ggml_nbytes(tensor);
|
||||
const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
||||
|
||||
if (padded_size > original_size) {
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
@@ -679,6 +679,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
|
||||
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
}
|
||||
}
|
||||
@@ -800,6 +801,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
|
||||
|
||||
static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
@@ -851,6 +853,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
@@ -889,6 +892,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
@@ -970,6 +974,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf
|
||||
|
||||
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
size_t total_size = 0;
|
||||
|
||||
@@ -1531,6 +1536,8 @@ static void ggml_cuda_op_mul_mat(
|
||||
|
||||
// If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
|
||||
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
|
||||
const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
|
||||
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
|
||||
@@ -2062,9 +2069,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
}
|
||||
|
||||
ggml_tensor src0_slice = *src0;
|
||||
src0_slice.ne[2] = 1;
|
||||
src0_slice.nb[3] = src0_slice.nb[2];
|
||||
src0_slice.data = (char *) src0->data + i02*nb02;
|
||||
src0_slice.ne[2] = 1;
|
||||
src0_slice.nb[3] = src0_slice.nb[2];
|
||||
src0_slice.op = GGML_OP_VIEW;
|
||||
src0_slice.view_src = dst->src[0]; // non-const pointer to src0
|
||||
src0_slice.data = (char *) src0->data + i02*nb02;
|
||||
|
||||
ggml_tensor src1_slice;
|
||||
memset(&src1_slice, 0, sizeof(src1_slice));
|
||||
@@ -3206,16 +3215,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
return false;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
|
||||
// different head sizes of K and V are not supported yet
|
||||
return false;
|
||||
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
|
||||
if (!new_mma_available(cc) || cc < GGML_CUDA_CC_AMPERE) {
|
||||
return false;
|
||||
}
|
||||
const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
|
||||
return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 192) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 576) {
|
||||
// DeepSeek MLA
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -89,6 +89,17 @@ void ggml_cuda_mul_mat_q(
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
// If src0 is a temporary compute buffer, clear any potential padding.
|
||||
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
const size_t size_data = ggml_nbytes(src0);
|
||||
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
|
||||
if (size_alloc > size_data) {
|
||||
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
|
||||
const int64_t s01 = src0->nb[1] / ts_src0;
|
||||
@@ -118,7 +129,7 @@ void ggml_cuda_mul_mat_q(
|
||||
|
||||
const mmq_args args = {
|
||||
src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
|
||||
ne00, ne01, ne1, s01, s1,
|
||||
ne00, ne01, ne1, s01, ne11, s1,
|
||||
ne02, ne12, s02, s12, s2,
|
||||
ne03, ne13, s03, s13, s3,
|
||||
use_stream_k};
|
||||
@@ -202,7 +213,7 @@ void ggml_cuda_mul_mat_q(
|
||||
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
|
||||
const mmq_args args = {
|
||||
src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d,
|
||||
ne00, ne01, ne_get_rows, s01, s1,
|
||||
ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
|
||||
ne02, ne02, s02, s12, s2,
|
||||
ne03, ne13, s03, s13, s3,
|
||||
use_stream_k};
|
||||
@@ -241,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
|
||||
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
|
||||
const mmq_args args = {
|
||||
src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
|
||||
ne00, row_diff, src1_ncols, stride01, nrows_dst,
|
||||
ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
|
||||
1, 1, 0, 0, 0,
|
||||
1, 1, 0, 0, 0,
|
||||
use_stream_k};
|
||||
|
||||
+27
-27
@@ -2522,7 +2522,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
|
||||
static __device__ __forceinline__ void mul_mat_q_process_tile(
|
||||
const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
|
||||
const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
|
||||
const int stride_row_x, const int ncols_y, const int stride_col_dst,
|
||||
const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {
|
||||
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
@@ -2606,7 +2606,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
static __global__ void mul_mat_q(
|
||||
const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
|
||||
const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
const int ncols_x, const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
|
||||
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
|
||||
const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
||||
|
||||
@@ -2619,8 +2619,8 @@ static __global__ void mul_mat_q(
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
|
||||
const int ntx = (ncols_y + mmq_x - 1) / mmq_x; // Number of tiles x
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
|
||||
const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; // Number of tiles x
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
|
||||
|
||||
// Initialize the ids for writing back data with just the index.
|
||||
// For regular matrix multiplications this is never changed.
|
||||
@@ -2648,8 +2648,8 @@ static __global__ void mul_mat_q(
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
int col_high = ncols_y;
|
||||
int col_diff = ncols_y;
|
||||
int col_high = ncols_dst;
|
||||
int col_diff = ncols_dst;
|
||||
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
|
||||
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
|
||||
|
||||
@@ -2689,7 +2689,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
constexpr bool fixup = false;
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
|
||||
return;
|
||||
}
|
||||
@@ -2720,8 +2720,8 @@ static __global__ void mul_mat_q(
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
int col_high = ncols_y;
|
||||
int col_diff = ncols_y;
|
||||
int col_high = ncols_dst;
|
||||
int col_diff = ncols_dst;
|
||||
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
|
||||
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
|
||||
|
||||
@@ -2767,7 +2767,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
|
||||
|
||||
kbc += blocks_per_ne00;
|
||||
@@ -2792,8 +2792,8 @@ static __global__ void mul_mat_q(
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
int col_high = ncols_y;
|
||||
int col_diff = ncols_y;
|
||||
int col_high = ncols_dst;
|
||||
int col_diff = ncols_dst;
|
||||
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
|
||||
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
|
||||
|
||||
@@ -2834,7 +2834,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
|
||||
}
|
||||
|
||||
@@ -2842,7 +2842,7 @@ static __global__ void mul_mat_q(
|
||||
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
static __global__ void mul_mat_q_stream_k_fixup(
|
||||
const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
|
||||
const int ncols_x, const int nrows_x, const int ncols_y, const int stride_col_dst,
|
||||
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst,
|
||||
const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) {
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
@@ -2851,8 +2851,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
|
||||
float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
|
||||
|
||||
const int ntx = (ncols_y + mmq_x - 1) / mmq_x;
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (ncols_dst + mmq_x - 1) / mmq_x;
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
|
||||
const int bidx0 = blockIdx.x;
|
||||
|
||||
@@ -2925,8 +2925,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
|
||||
dst += offset_dst;
|
||||
|
||||
const int i_max = nrows_x - it*mmq_y - 1;
|
||||
const int j_max = ncols_y - jt*mmq_x - 1;
|
||||
const int i_max = nrows_x - it*mmq_y - 1;
|
||||
const int j_max = ncols_dst - jt*mmq_x - 1;
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
@@ -2989,7 +2989,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
|
||||
struct mmq_args {
|
||||
const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst;
|
||||
int64_t ncols_x; int64_t nrows_x; int64_t ncols_y; int64_t stride_row_x; int64_t nrows_dst;
|
||||
int64_t ncols_x; int64_t nrows_x; int64_t ncols_dst; int64_t stride_row_x; int64_t ncols_y; int64_t nrows_dst;
|
||||
int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst;
|
||||
int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst;
|
||||
bool use_stream_k;
|
||||
@@ -3025,8 +3025,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
|
||||
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (args.ncols_y + mmq_x - 1) / mmq_x;
|
||||
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x;
|
||||
const int ntzw = args.nchannels_y * args.nsamples_y;
|
||||
const dim3 block_nums_xy_tiling(nty, ntx, ntzw);
|
||||
|
||||
@@ -3040,14 +3040,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
constexpr bool need_check = false;
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
|
||||
} else {
|
||||
constexpr bool need_check = true;
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
|
||||
}
|
||||
@@ -3068,7 +3068,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
|
||||
|
||||
@@ -3077,14 +3077,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
}
|
||||
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
|
||||
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
|
||||
} else {
|
||||
constexpr bool need_check = true;
|
||||
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
|
||||
|
||||
@@ -3093,7 +3093,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
}
|
||||
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
|
||||
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -513,6 +513,17 @@ void ggml_cuda_mul_mat_vec_q(
|
||||
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
// If src0 is a temporary compute buffer, clear any potential padding.
|
||||
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
const size_t size_data = ggml_nbytes(src0);
|
||||
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
|
||||
if (size_alloc > size_data) {
|
||||
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1);
|
||||
{
|
||||
|
||||
@@ -163,6 +163,7 @@ void quantize_mmq_q8_1_cuda(
|
||||
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
|
||||
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
GGML_ASSERT(ne0 % (4*QK8_1) == 0);
|
||||
|
||||
const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 1);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 2);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 4);
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 2, 4);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 32, 1);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2);
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 2);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 4);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 8);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 64, 1);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 1);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 2);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 4);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 8);
|
||||
|
||||
@@ -18,7 +18,7 @@ SOURCE_FATTN_MMA_START = """// This file has been autogenerated by generate_cu_f
|
||||
|
||||
"""
|
||||
|
||||
SOURCE_FATTN_MMA_CASE = "DECL_FATTN_MMA_F16_CASE({head_size}, {ncols1}, {ncols2});\n"
|
||||
SOURCE_FATTN_MMA_CASE = "DECL_FATTN_MMA_F16_CASE({head_size_kq}, {head_size_v}, {ncols1}, {ncols2});\n"
|
||||
|
||||
TYPES_MMQ = [
|
||||
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
|
||||
@@ -57,18 +57,21 @@ for vkq_size in [16, 32]:
|
||||
with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f:
|
||||
f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v))
|
||||
|
||||
for ncols in [8, 16, 32, 64, 128]:
|
||||
for ncols2 in [1, 2, 4, 8]:
|
||||
for ncols in [8, 16, 32, 64]:
|
||||
for ncols2 in [1, 2, 4, 8, 16]:
|
||||
if ncols2 > ncols:
|
||||
continue
|
||||
ncols1 = ncols // ncols2
|
||||
if ncols == 128:
|
||||
continue # Too much register pressure.
|
||||
with open(f"fattn-mma-f16-instance-ncols1_{ncols1}-ncols2_{ncols2}.cu", "w") as f:
|
||||
f.write(SOURCE_FATTN_MMA_START)
|
||||
|
||||
for head_size in [64, 80, 96, 112, 128, 256]:
|
||||
if ncols == 128 and head_size == 256:
|
||||
continue # Needs too much shared memory.
|
||||
f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size=head_size))
|
||||
for head_size_kq in [64, 80, 96, 112, 128, 256, 576]:
|
||||
if head_size_kq != 576 and ncols2 == 16:
|
||||
continue
|
||||
if head_size_kq == 576 and ncols2 != 16:
|
||||
continue
|
||||
head_size_v = head_size_kq if head_size_kq != 576 else 512
|
||||
f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v))
|
||||
|
||||
for type in TYPES_MMQ:
|
||||
with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
|
||||
|
||||
@@ -299,21 +299,42 @@ typedef struct {
|
||||
} ggml_metal_kargs_mul_mv_ext;
|
||||
|
||||
typedef struct {
|
||||
int32_t nei0;
|
||||
int32_t nei1;
|
||||
uint64_t nbi1;
|
||||
int32_t ne10;
|
||||
int32_t ne11; // n_expert_used (bcast)
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
int32_t neh11; // n_tokens
|
||||
uint64_t nbh11;
|
||||
int32_t ne20; // n_expert_used
|
||||
uint64_t nb21;
|
||||
} ggml_metal_kargs_mul_mm_id_map0;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne20; // n_expert_used
|
||||
int32_t neh0;
|
||||
int32_t neh1;
|
||||
uint64_t nbh1;
|
||||
uint64_t nbh2;
|
||||
int32_t ne0;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
} ggml_metal_kargs_mul_mm_id_map1;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne00;
|
||||
int32_t ne02;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
int32_t ne11;
|
||||
int32_t ne12;
|
||||
int32_t ne13;
|
||||
uint64_t nb10;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
int32_t ne0;
|
||||
int32_t ne1;
|
||||
uint64_t nb03;
|
||||
int32_t neh12;
|
||||
uint64_t nbh10;
|
||||
uint64_t nbh11;
|
||||
uint64_t nbh12;
|
||||
uint64_t nbh13;
|
||||
int32_t neh0;
|
||||
int32_t neh1;
|
||||
int16_t r2;
|
||||
int16_t r3;
|
||||
} ggml_metal_kargs_mul_mm_id;
|
||||
|
||||
typedef struct {
|
||||
|
||||
+221
-110
@@ -306,28 +306,30 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,
|
||||
@@ -650,7 +652,8 @@ static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) {
|
||||
}
|
||||
|
||||
if (mem_pool->heaps_to_remove.count > 0) {
|
||||
for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) {
|
||||
// remove in reverse order
|
||||
for (NSUInteger i = [mem_pool->heaps_to_remove count] - 1; ; --i) {
|
||||
NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue];
|
||||
ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index];
|
||||
|
||||
@@ -659,6 +662,10 @@ static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) {
|
||||
|
||||
[mem_pool->heaps removeObjectAtIndex:index];
|
||||
[ptr release];
|
||||
|
||||
if (i == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
[mem_pool->heaps_to_remove removeAllObjects];
|
||||
@@ -672,7 +679,7 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) {
|
||||
}
|
||||
|
||||
static id<MTLBuffer> ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) {
|
||||
const size_t alignment = 32;
|
||||
const size_t alignment = 256;
|
||||
|
||||
const size_t size_aligned = GGML_PAD(size, alignment);
|
||||
|
||||
@@ -1242,28 +1249,30 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32, mul_mm_id_bf16_f32, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16, mul_mm_id_map0_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32, mul_mm_id_map1_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16, mul_mm_id_f32_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16, mul_mm_id_f16_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16, mul_mm_id_bf16_f16, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16, mul_mm_id_q4_0_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16, mul_mm_id_q4_1_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16, mul_mm_id_q5_0_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16, mul_mm_id_q5_1_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16, mul_mm_id_q8_0_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16, mul_mm_id_q2_K_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16, mul_mm_id_q3_K_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16, mul_mm_id_q4_K_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16, mul_mm_id_q5_K_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16, mul_mm_id_q6_K_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16, mul_mm_id_iq2_xxs_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16, mul_mm_id_iq2_xs_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16, mul_mm_id_iq3_xxs_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16, mul_mm_id_iq3_s_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16, mul_mm_id_iq2_s_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16, mul_mm_id_iq1_s_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16, mul_mm_id_iq1_m_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16, mul_mm_id_iq4_nl_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16, mul_mm_id_iq4_xs_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, rope_norm_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, rope_neox_f32, true);
|
||||
@@ -2999,7 +3008,7 @@ static bool ggml_metal_encode_node(
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
||||
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
} else {
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
@@ -3219,8 +3228,6 @@ static bool ggml_metal_encode_node(
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
const int n_as = src0->ne[2];
|
||||
|
||||
// src2 = ids
|
||||
const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
|
||||
|
||||
@@ -3234,24 +3241,21 @@ static bool ggml_metal_encode_node(
|
||||
GGML_ASSERT(ne03 == 1);
|
||||
GGML_ASSERT(ne13 == 1);
|
||||
|
||||
const uint32_t r2 = 1;
|
||||
const uint32_t r3 = 1;
|
||||
|
||||
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
||||
// to the matrix-vector kernel
|
||||
// ne20 = n_used_experts
|
||||
// ne21 = n_rows
|
||||
const int dst_rows = ne20*ne21;
|
||||
const int dst_rows_min = n_as;
|
||||
const int dst_rows_max = (device.maxThreadgroupMemoryLength/2 - 8192)/4;
|
||||
|
||||
// max size of the rowids array in the kernel shared buffer
|
||||
//GGML_ASSERT(dst_rows <= dst_rows_max);
|
||||
// ne21 = n_rows (batch size)
|
||||
const int ne21_mm_id_min = 32;
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
if ([device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
ne00 % 32 == 0 && ne00 >= 64 &&
|
||||
//ne01 / ne02 >= 512 && // NOTE: this is based on Mixtral shapes, might need adjustments
|
||||
dst_rows > dst_rows_min &&
|
||||
dst_rows <= dst_rows_max) {
|
||||
(ne21 >= ne21_mm_id_min)) {
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
|
||||
// some Metal matrix data types require aligned pointers
|
||||
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
|
||||
@@ -3262,62 +3266,169 @@ static bool ggml_metal_encode_node(
|
||||
default: break;
|
||||
}
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
const int64_t neh10 = ne10; // n_embd
|
||||
const int64_t neh11 = ne21; // n_tokens
|
||||
const int64_t neh12 = ne02; // n_expert
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
|
||||
default: GGML_ABORT("MUL_MAT_ID not implemented");
|
||||
const uint64_t nbh10 = ggml_type_size(GGML_TYPE_F16);
|
||||
const uint64_t nbh11 = nbh10*neh10;
|
||||
const uint64_t nbh12 = nbh11*neh11;
|
||||
const uint64_t nbh13 = nbh12*neh12;
|
||||
|
||||
const size_t s_src1 = ggml_type_size(GGML_TYPE_F16)*neh10*neh11*neh12;
|
||||
id<MTLBuffer> h_src1 = ggml_metal_mem_pool_alloc(mem_pool, s_src1);
|
||||
if (!h_src1) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_src1);
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_metal_kargs_mul_mm_id args = {
|
||||
/*.nei0 =*/ ne20,
|
||||
/*.nei1 =*/ ne21,
|
||||
/*.nbi1 =*/ nb21,
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.ne11 =*/ ne11,
|
||||
/*.ne12 =*/ ne12,
|
||||
/*.ne13 =*/ ne13,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
};
|
||||
const int64_t neh0 = ne0;
|
||||
const int64_t neh1 = ne21;
|
||||
const int64_t neh2 = ne02;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:0];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
||||
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:4];
|
||||
const uint64_t nbh0 = ggml_type_size(GGML_TYPE_F32);
|
||||
const uint64_t nbh1 = nbh0*neh0;
|
||||
const uint64_t nbh2 = nbh1*neh1;
|
||||
//const uint64_t nbh3 = nbh2*neh2;
|
||||
|
||||
[encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0];
|
||||
const size_t s_dst = ggml_type_size(GGML_TYPE_F32)*neh0*neh1*neh2;
|
||||
id<MTLBuffer> h_dst = ggml_metal_mem_pool_alloc(mem_pool, s_dst);
|
||||
if (!h_dst) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_dst);
|
||||
return false;
|
||||
}
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
// tokens per expert
|
||||
const size_t s_tpe = ggml_type_size(GGML_TYPE_I32)*ne02;
|
||||
id<MTLBuffer> h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe);
|
||||
if (!h_tpe) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tpe);
|
||||
return false;
|
||||
}
|
||||
|
||||
// id map
|
||||
// [n_expert_used, n_tokens]
|
||||
const size_t s_ids = ggml_type_size(GGML_TYPE_I32)*ne20*ne21;
|
||||
id<MTLBuffer> h_ids = ggml_metal_mem_pool_alloc(mem_pool, s_ids);
|
||||
if (!h_ids) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids);
|
||||
return false;
|
||||
}
|
||||
|
||||
{
|
||||
const int nth = MIN(1024, ne10/4);
|
||||
|
||||
ggml_metal_kargs_mul_mm_id_map0 args = {
|
||||
ne10,
|
||||
ne11, // n_expert_used (bcast)
|
||||
nb11,
|
||||
nb12,
|
||||
neh11, // n_tokens
|
||||
nbh11,
|
||||
ne20, // n_expert_used
|
||||
nb21,
|
||||
};
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
|
||||
[encoder setBuffer: h_src1 offset:0 atIndex:3];
|
||||
[encoder setBuffer: h_tpe offset:0 atIndex:4];
|
||||
[encoder setBuffer: h_ids offset:0 atIndex:5];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne02, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
}
|
||||
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16 ].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16 ].pipeline; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16 ].pipeline; break;
|
||||
case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16 ].pipeline; break;
|
||||
case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16 ].pipeline; break;
|
||||
case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16 ].pipeline; break;
|
||||
case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16 ].pipeline; break;
|
||||
default: GGML_ABORT("MUL_MAT_ID not implemented");
|
||||
}
|
||||
|
||||
ggml_metal_kargs_mul_mm_id args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.neh12 =*/ neh12,
|
||||
/*.nbh10 =*/ nbh10,
|
||||
/*.nbh11 =*/ nbh11,
|
||||
/*.nbh12 =*/ nbh12,
|
||||
/*.nbh13 =*/ nbh13,
|
||||
/*.neh0 =*/ neh0,
|
||||
/*.neh1 =*/ neh1,
|
||||
/*.r2 =*/ r2,
|
||||
/*.r3 =*/ r3,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:0];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
[encoder setBuffer: h_src1 offset:0 atIndex:2];
|
||||
[encoder setBuffer: h_tpe offset:0 atIndex:3];
|
||||
[encoder setBuffer: h_dst offset:0 atIndex:4];
|
||||
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, ne02) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
}
|
||||
|
||||
{
|
||||
GGML_ASSERT(ne0 % 4 == 0);
|
||||
|
||||
const int nth = MIN(1024, ne0/4);
|
||||
|
||||
ggml_metal_kargs_mul_mm_id_map1 args = {
|
||||
ne20, // n_expert_used
|
||||
neh0,
|
||||
neh1,
|
||||
nbh1,
|
||||
nbh2,
|
||||
ne0,
|
||||
nb1,
|
||||
nb2,
|
||||
};
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:0];
|
||||
[encoder setBuffer: h_dst offset:0 atIndex:1];
|
||||
[encoder setBuffer: h_ids offset:0 atIndex:2];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne20, ne21, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
}
|
||||
} else {
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
@@ -3511,7 +3622,7 @@ static bool ggml_metal_encode_node(
|
||||
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:4];
|
||||
|
||||
const int64_t _ne1 = 1;
|
||||
const int64_t ne123 = dst_rows;
|
||||
const int64_t ne123 = ne20*ne21;
|
||||
|
||||
if (smem > 0) {
|
||||
[encoder setThreadgroupMemoryLength:smem atIndex:0];
|
||||
|
||||
@@ -6336,127 +6336,219 @@ kernel void kernel_mul_mm(
|
||||
}
|
||||
}
|
||||
|
||||
// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in rowids
|
||||
// TODO: this kernel needs to be reimplemented from scratch for better performance
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
|
||||
void kernel_mul_mm_id_impl(
|
||||
int32_t ne00,
|
||||
int32_t ne02,
|
||||
uint64_t nb01,
|
||||
uint64_t nb02,
|
||||
int32_t ne11,
|
||||
int32_t ne12,
|
||||
uint64_t nb10,
|
||||
uint64_t nb11,
|
||||
uint64_t nb12,
|
||||
int32_t ne0,
|
||||
int32_t ne1,
|
||||
int64_t ne0ne1,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
threadgroup ushort2 * rowids,
|
||||
device char * dst,
|
||||
threadgroup char * shmem,
|
||||
template<typename T4>
|
||||
kernel void kernel_mul_mm_id_map0(
|
||||
constant ggml_metal_kargs_mul_mm_id_map0 & args,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * hsrc1,
|
||||
device char * htpe,
|
||||
device char * hids,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int ide = tgpig[0]; // expert id
|
||||
|
||||
int n_all = 0;
|
||||
|
||||
device int32_t * ids_i32 = (device int32_t *) (hids);
|
||||
|
||||
for (int i21 = 0; i21 < args.neh11; i21++) { // n_tokens
|
||||
device const int32_t * src2_i32 = (device const int32_t *) (src2 + i21*args.nb21);
|
||||
|
||||
for (int i20 = 0; i20 < args.ne20; i20++) { // n_expert_used
|
||||
if (src2_i32[i20] != ide) {
|
||||
continue;
|
||||
}
|
||||
|
||||
device const float4 * src1_f32x4 = (device const float4 *) ( src1 + i21*args.nb12 + (i20%args.ne11)*args.nb11);
|
||||
device T4 * hsrc1_f32x4 = (device T4 *) (hsrc1 + (ide*args.neh11 + n_all)*args.nbh11);
|
||||
|
||||
for (int64_t i00 = tpitg.x; i00 < args.ne10/4; i00 += ntg.x) {
|
||||
hsrc1_f32x4[i00] = (T4) (src1_f32x4[i00]);
|
||||
}
|
||||
|
||||
if (tpitg.x == 0) {
|
||||
ids_i32[i21*args.ne20 + i20] = ide*args.neh11 + n_all;
|
||||
}
|
||||
|
||||
++n_all;
|
||||
}
|
||||
}
|
||||
|
||||
if (tpitg.x == 0) {
|
||||
device int32_t * tpe_i32 = (device int32_t *) (htpe);
|
||||
tpe_i32[ide] = n_all;
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_mul_mm_id_map0<half4>) kernel_mul_mm_id_map0_t;
|
||||
|
||||
template [[host_name("kernel_mul_mm_id_map0_f16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<half4>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_mul_mm_id_map1(
|
||||
constant ggml_metal_kargs_mul_mm_id_map1 & args,
|
||||
device const char * hdst,
|
||||
device const char * hids,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int i20 = tgpig[0]; // used expert
|
||||
const int i21 = tgpig[1]; // token
|
||||
|
||||
device const int32_t * ids_i32 = (device const int32_t *) (hids);
|
||||
device float4 * dst_f32x4 = (device float4 *) (dst + i20*args.nb1 + i21*args.nb2);
|
||||
|
||||
const int id = ids_i32[i21*args.ne20 + i20];
|
||||
|
||||
const int ide = id / args.neh1;
|
||||
const int idt = id % args.neh1;
|
||||
|
||||
device const float4 * hdst_f32x4 = (device const float4 *) (hdst + idt*args.nbh1 + ide*args.nbh2);
|
||||
|
||||
for (int64_t i0 = tpitg.x; i0 < args.neh0/4; i0 += ntg.x) {
|
||||
dst_f32x4[i0] = hdst_f32x4[i0];
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_mul_mm_id_map1<float>) kernel_mul_mm_id_map1_t;
|
||||
|
||||
template [[host_name("kernel_mul_mm_id_map1_f32")]] kernel kernel_mul_mm_id_map1_t kernel_mul_mm_id_map1<float>;
|
||||
|
||||
template<typename T, typename T4x4, typename simdgroup_T8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)>
|
||||
kernel void kernel_mul_mm_id(
|
||||
constant ggml_metal_kargs_mul_mm_id & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * tpe,
|
||||
device char * dst,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
threadgroup half * sa = (threadgroup half *)(shmem);
|
||||
threadgroup float * sb = (threadgroup float *)(shmem + 4096);
|
||||
threadgroup T * sa = (threadgroup T *)(shmem);
|
||||
threadgroup half * sb = (threadgroup half *)(shmem + 4096);
|
||||
|
||||
const int r0 = tgpig.y;
|
||||
const int r1 = tgpig.x;
|
||||
const int im = tgpig.z;
|
||||
|
||||
if (r1*BLOCK_SIZE_N >= ne1) return;
|
||||
device const int32_t * tpe_i32 = (device const int32_t *) (tpe);
|
||||
|
||||
const int neh1 = tpe_i32[im];
|
||||
|
||||
if (r1*BLOCK_SIZE_N >= neh1) {
|
||||
return;
|
||||
}
|
||||
|
||||
// if this block is of 64x32 shape or smaller
|
||||
short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
|
||||
short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
|
||||
const short n_rows = (args.neh0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.neh0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
|
||||
const short n_cols = ( neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? ( neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
|
||||
|
||||
// a thread shouldn't load data outside of the matrix
|
||||
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
|
||||
simdgroup_half8x8 ma[4];
|
||||
simdgroup_float8x8 mb[2];
|
||||
simdgroup_T8x8 ma[4];
|
||||
simdgroup_half8x8 mb[2];
|
||||
simdgroup_float8x8 mc[8];
|
||||
for (int i = 0; i < 8; i++){
|
||||
|
||||
for (short i = 0; i < 8; i++){
|
||||
mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
|
||||
}
|
||||
|
||||
short il = (tiitg % THREAD_PER_ROW);
|
||||
|
||||
ushort offset1 = il/nl;
|
||||
const int i12 = im%args.neh12;
|
||||
const int i13 = im/args.neh12;
|
||||
|
||||
threadgroup const auto & id = rowids[r1 * BLOCK_SIZE_N + thread_col];
|
||||
const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const short offset1 = il/nl;
|
||||
|
||||
device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01) + offset1;
|
||||
device const float * y = (device const float *)(src1
|
||||
+ nb12 * id[1]
|
||||
+ nb11 * (id[0] % ne11)
|
||||
+ nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
|
||||
device const block_q * x = (device const block_q *)(src0
|
||||
+ args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
|
||||
|
||||
for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
|
||||
device const half * y = (device const half *)(src1
|
||||
+ args.nbh13*i13
|
||||
+ args.nbh12*i12
|
||||
+ args.nbh11*(r1*BLOCK_SIZE_N + thread_col)
|
||||
+ args.nbh10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
|
||||
|
||||
for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
|
||||
// load data and store to threadgroup memory
|
||||
half4x4 temp_a;
|
||||
T4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
for (int i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
|
||||
+ (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
|
||||
+ (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
|
||||
#pragma unroll(16)
|
||||
for (short i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
|
||||
+ (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
|
||||
+ (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4];
|
||||
}
|
||||
|
||||
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
|
||||
*(threadgroup half2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device half2x4 *) y);
|
||||
|
||||
il = (il + 2 < nl) ? il + 2 : il % 2;
|
||||
x = (il < 2) ? x + (2+nl-1)/nl : x;
|
||||
x = (il < 2) ? x + (2 + nl - 1)/nl : x;
|
||||
y += BLOCK_SIZE_K;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// load matrices from threadgroup memory and conduct outer products
|
||||
threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
|
||||
threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
|
||||
threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
|
||||
threadgroup const half * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
|
||||
|
||||
#pragma unroll(BLOCK_SIZE_K/8)
|
||||
for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
|
||||
#pragma unroll(4)
|
||||
for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (short i = 0; i < 4; i++) {
|
||||
simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
|
||||
}
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
#pragma unroll(2)
|
||||
for (int i = 0; i < 2; i++) {
|
||||
for (short i = 0; i < 2; i++) {
|
||||
simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
|
||||
}
|
||||
|
||||
lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
|
||||
lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
|
||||
|
||||
#pragma unroll(8)
|
||||
for (int i = 0; i < 8; i++){
|
||||
for (short i = 0; i < 8; i++){
|
||||
simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
|
||||
}
|
||||
|
||||
lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
|
||||
lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
if ((r0 + 1) * BLOCK_SIZE_M <= args.neh0 && (r1 + 1) * BLOCK_SIZE_N <= neh1) {
|
||||
device float * C = (device float *) dst +
|
||||
(BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \
|
||||
(BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.neh0 + im*args.neh1*args.neh0;
|
||||
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.neh0 * (i/4), args.neh0);
|
||||
}
|
||||
} else {
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
threadgroup float * temp_str = ((threadgroup float *) shmem) \
|
||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
|
||||
+ 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (sgitg == 0) {
|
||||
for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
|
||||
threadgroup const auto & jid = rowids[r1 * BLOCK_SIZE_N + j];
|
||||
int64_t joff = jid[0]*ne0 + jid[1]*ne0ne1;
|
||||
|
||||
device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + joff;
|
||||
device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.neh0 + im*args.neh1*args.neh0;
|
||||
device float4 * D4 = (device float4 *) D;
|
||||
|
||||
threadgroup float * C = temp_str + (j*BLOCK_SIZE_M);
|
||||
@@ -6476,66 +6568,6 @@ void kernel_mul_mm_id_impl(
|
||||
}
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
|
||||
kernel void kernel_mul_mm_id(
|
||||
constant ggml_metal_kargs_mul_mm_id & args,
|
||||
device const char * src0s,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
device const char * ids,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int32_t i02 = tgpig.z;
|
||||
|
||||
tgpig.z = 0;
|
||||
|
||||
device const char * src0 = src0s + i02*args.nb02;
|
||||
|
||||
// row indices
|
||||
threadgroup ushort2 * rowids = (threadgroup ushort2 *)(shmem + 8192);
|
||||
|
||||
// TODO: parallelize this loop
|
||||
int32_t _ne1 = 0;
|
||||
for (ushort ii1 = 0; ii1 < args.nei1; ii1++) {
|
||||
for (ushort ii0 = 0; ii0 < args.nei0; ii0++) {
|
||||
int32_t id = ((device int32_t *) (ids + ii1*args.nbi1))[ii0];
|
||||
if (id == i02) {
|
||||
if (tiitg == 0) {
|
||||
rowids[_ne1] = ushort2(ii0, ii1);
|
||||
}
|
||||
_ne1++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
kernel_mul_mm_id_impl<block_q, nl, dequantize_func>(
|
||||
args.ne00,
|
||||
args.ne02,
|
||||
args.nb01,
|
||||
args.nb02,
|
||||
args.ne11,
|
||||
args.ne12,
|
||||
args.nb10,
|
||||
args.nb11,
|
||||
args.nb12,
|
||||
args.ne0,
|
||||
_ne1,
|
||||
(int64_t)args.ne0*args.ne1,
|
||||
src0,
|
||||
src1,
|
||||
rowids,
|
||||
dst,
|
||||
shmem,
|
||||
tgpig,
|
||||
tiitg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
#define QK_NL 16
|
||||
|
||||
//
|
||||
@@ -6576,63 +6608,64 @@ template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_q_t kernel_get
|
||||
// matrix-matrix multiplication
|
||||
//
|
||||
|
||||
typedef decltype(kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>) mat_mm_t;
|
||||
typedef decltype(kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>) mul_mm_t;
|
||||
|
||||
template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_mul_mm_f32_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half4x4, 1, dequantize_f16>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mat_mm_t kernel_mul_mm<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4, 1, dequantize_bf16>;
|
||||
template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mul_mm_t kernel_mul_mm<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4, 1, dequantize_bf16>;
|
||||
#endif
|
||||
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
|
||||
//
|
||||
// indirect matrix-matrix multiplication
|
||||
//
|
||||
|
||||
typedef decltype(kernel_mul_mm_id<float4x4, 1, dequantize_f32>) mat_mm_id_t;
|
||||
typedef decltype(kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>) mul_mm_id;
|
||||
|
||||
template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half4x4, 1, dequantize_f16>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<bfloat4x4, 1, dequantize_bf16>;
|
||||
template [[host_name("kernel_mul_mm_id_bf16_f16")]] kernel mul_mm_id kernel_mul_mm_id<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4, 1, dequantize_bf16>;
|
||||
#endif
|
||||
template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_1_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_mul_mm_id_q8_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_id_q2_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q3_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q6_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_s_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_s_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_m_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
|
||||
|
||||
//
|
||||
// matrix-vector multiplication
|
||||
|
||||
@@ -19,12 +19,6 @@
|
||||
#define GROUP_MAX_EPS_IQ1_M 1e-7f
|
||||
#define GROUP_MAX_EPS_IQ1_S 1e-12f
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid warnings for hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
|
||||
@@ -151,6 +151,12 @@ struct rpc_msg_buffer_clear_req {
|
||||
uint8_t value;
|
||||
};
|
||||
|
||||
struct rpc_msg_set_tensor_hash_req {
|
||||
rpc_tensor tensor;
|
||||
uint64_t offset;
|
||||
uint64_t hash;
|
||||
};
|
||||
|
||||
struct rpc_msg_set_tensor_hash_rsp {
|
||||
uint8_t result;
|
||||
};
|
||||
@@ -548,15 +554,12 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
rpc_tensor rpc_tensor = serialize_tensor(tensor);
|
||||
if (size > HASH_THRESHOLD) {
|
||||
// input serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes)
|
||||
size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + sizeof(uint64_t);
|
||||
std::vector<uint8_t> input(input_size, 0);
|
||||
uint64_t hash = fnv_hash((const uint8_t*)data, size);
|
||||
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
||||
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
||||
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &hash, sizeof(hash));
|
||||
rpc_msg_set_tensor_hash_req request;
|
||||
request.tensor = rpc_tensor;
|
||||
request.offset = offset;
|
||||
request.hash = fnv_hash((const uint8_t*)data, size);
|
||||
rpc_msg_set_tensor_hash_rsp response;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, input.data(), input.size(), &response, sizeof(response));
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
if (response.result) {
|
||||
// the server has the same data, no need to send it
|
||||
@@ -864,7 +867,7 @@ public:
|
||||
bool free_buffer(const rpc_msg_free_buffer_req & request);
|
||||
bool buffer_clear(const rpc_msg_buffer_clear_req & request);
|
||||
bool set_tensor(const std::vector<uint8_t> & input);
|
||||
bool set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set_tensor_hash_rsp & response);
|
||||
bool set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response);
|
||||
bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
|
||||
bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
|
||||
bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
|
||||
@@ -1101,18 +1104,10 @@ bool rpc_server::get_cached_file(uint64_t hash, std::vector<uint8_t> & data) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set_tensor_hash_rsp & response)
|
||||
bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response)
|
||||
{
|
||||
// serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes) |
|
||||
if (input.size() != sizeof(rpc_tensor) + 16) {
|
||||
return false;
|
||||
}
|
||||
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
|
||||
uint64_t offset;
|
||||
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
|
||||
const uint64_t * hash = (const uint64_t *)(input.data() + sizeof(rpc_tensor) + sizeof(offset));
|
||||
std::vector<uint8_t> cached_file;
|
||||
if (!get_cached_file(*hash, cached_file)) {
|
||||
if (!get_cached_file(request.hash, cached_file)) {
|
||||
response.result = 0;
|
||||
return true;
|
||||
}
|
||||
@@ -1125,25 +1120,28 @@ bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set
|
||||
ggml_context_ptr ctx_ptr { ggml_init(params) };
|
||||
GGML_ASSERT(ctx_ptr != nullptr);
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
||||
if (tensor == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size, *hash);
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n",
|
||||
__func__, (void*)tensor->buffer, tensor->data, request.offset, size, request.hash);
|
||||
|
||||
// sanitize tensor->data
|
||||
{
|
||||
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
|
||||
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
|
||||
|
||||
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
|
||||
if (request.tensor.data + request.offset < p0
|
||||
|| request.tensor.data + request.offset >= p1
|
||||
|| size > (p1 - request.tensor.data - request.offset)) {
|
||||
GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%zu, hash=0x%" PRIx64 ") out of buffer bounds [0x%zx, 0x%zx)\n",
|
||||
__func__, in_tensor->data, offset, size, *hash, p0, p1);
|
||||
__func__, request.tensor.data, request.offset, size, request.hash, p0, p1);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
ggml_backend_tensor_set(tensor, cached_file.data(), offset, size);
|
||||
ggml_backend_tensor_set(tensor, cached_file.data(), request.offset, size);
|
||||
response.result = 1;
|
||||
return true;
|
||||
}
|
||||
@@ -1503,12 +1501,12 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
break;
|
||||
}
|
||||
case RPC_CMD_SET_TENSOR_HASH: {
|
||||
std::vector<uint8_t> input;
|
||||
if (!recv_msg(sockfd, input)) {
|
||||
rpc_msg_set_tensor_hash_req request;
|
||||
if (!recv_msg(sockfd, &request, sizeof(request))) {
|
||||
return;
|
||||
}
|
||||
rpc_msg_set_tensor_hash_rsp response;
|
||||
if (!server.set_tensor_hash(input, response)) {
|
||||
if (!server.set_tensor_hash(request, response)) {
|
||||
return;
|
||||
}
|
||||
if (!send_msg(sockfd, &response, sizeof(response))) {
|
||||
|
||||
@@ -80,10 +80,6 @@ extern int g_ggml_sycl_disable_optimize;
|
||||
// max batch size to use MMQ kernels when tensor cores are available
|
||||
#define MMQ_MAX_BATCH_SIZE 32
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable : 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
// dmmv = dequantize_mul_mat_vec
|
||||
#ifndef GGML_SYCL_DMMV_X
|
||||
#define GGML_SYCL_DMMV_X 32
|
||||
@@ -118,17 +114,12 @@ static void crash() {
|
||||
GGML_ABORT("SYCL error");
|
||||
}
|
||||
|
||||
#define SYCL_CHECK(err) \
|
||||
do { \
|
||||
auto err_ = (err); \
|
||||
if (err_ != 0) \
|
||||
ggml_sycl_error( \
|
||||
#err, \
|
||||
__func__, \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
"Meet error in this line code!"); \
|
||||
} while (0)
|
||||
#define SYCL_CHECK(err) \
|
||||
do { \
|
||||
auto err_ = (err); \
|
||||
if (err_ != 0) \
|
||||
ggml_sycl_error(#err, __func__, __FILE__, __LINE__, "Exception caught in this line of code."); \
|
||||
} while (0)
|
||||
|
||||
#if DPCT_COMPAT_RT_VERSION >= 11100
|
||||
#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
|
||||
|
||||
@@ -437,41 +437,52 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
static void convert_unary_nc(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
|
||||
const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
|
||||
const int64_t work_group_size = item_ct1.get_local_range(2);
|
||||
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
|
||||
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
|
||||
|
||||
const int64_t i01 = item_ct1.get_group(1);
|
||||
const int64_t i02 = item_ct1.get_group(0) % ne02;
|
||||
const int64_t i03 = item_ct1.get_group(0) / ne02;
|
||||
|
||||
// make each work-item deal with more elements since sycl global range can not exceed max int
|
||||
const src_t * x = (const src_t *) vx;
|
||||
for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
|
||||
y[i] = x[i];
|
||||
const src_t * x = static_cast<const src_t *>(vx);
|
||||
const int64_t ix = i03 * s03 + i02 * s02 + i01 * s01;
|
||||
const int64_t iy = ((i03 * ne02 + i02) * ne01 + i01) * ne00;
|
||||
|
||||
#pragma unroll
|
||||
for (int64_t i00 = global_id; i00 < ne00; i00 += work_group_size * item_ct1.get_group_range(2)) {
|
||||
y[iy + i00] = static_cast<dst_t>(x[ix + i00]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_sycl(const void *__restrict__ vx,
|
||||
dst_t *__restrict__ y, const int64_t k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int64_t num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
|
||||
static void convert_unary_nc_sycl(const void * __restrict__ vx, dst_t * __restrict__ y,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03, dpct::queue_ptr queue) {
|
||||
dpct::has_capability_or_fail(queue->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
sycl::range<3> global_size(ne02 * ne03, ne01, ceil_div(ne00, SYCL_DEQUANTIZE_BLOCK_SIZE));
|
||||
|
||||
// decrease global range when it exceeds the max int
|
||||
int64_t local_size = downsample_sycl_global_range(num_blocks, SYCL_DEQUANTIZE_BLOCK_SIZE);
|
||||
sycl::range<3> block_nums(1, 1, num_blocks);
|
||||
sycl::range<3> local_range(1, 1, local_size);
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
// TODO: Downsample logic is separated from the kernel, a rewrite is desirable
|
||||
int64_t downsized_workgroup = downsample_sycl_global_range(global_size[0], SYCL_DEQUANTIZE_BLOCK_SIZE);
|
||||
sycl::range<3> workgroup_size(1, 1, downsized_workgroup);
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * local_range, local_range),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
convert_unary<src_t>(vx, y, k, item_ct1);
|
||||
});
|
||||
}
|
||||
queue->parallel_for(sycl::nd_range<3>(global_size * workgroup_size, workgroup_size), [=](sycl::nd_item<3> item_ct1) {
|
||||
convert_unary_nc<src_t>(vx, y, ne00, ne01, ne02, s01, s02, s03, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr queue) {
|
||||
convert_unary_nc_sycl<src_t>(vx, y, k, 1, 1, 1, k, k, k, queue);
|
||||
}
|
||||
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
if (dst->src[0]->extra &&
|
||||
@@ -574,3 +585,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_nc_sycl<float>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// Copyright (C) 2025 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
@@ -16,12 +16,19 @@
|
||||
#include "common.hpp"
|
||||
|
||||
template <typename T>
|
||||
using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
|
||||
int64_t k, dpct::queue_ptr stream);
|
||||
typedef to_t_sycl_t<float> to_fp32_sycl_t;
|
||||
using to_t_sycl_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, dpct::queue_ptr stream);
|
||||
typedef to_t_sycl_t<float> to_fp32_sycl_t;
|
||||
typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
|
||||
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
|
||||
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst);
|
||||
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_CONVERT_HPP
|
||||
// Nc = Non-contiguous
|
||||
template <typename T>
|
||||
using to_t_nc_sycl_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
|
||||
int64_t s01, int64_t s02, int64_t s03, dpct::queue_ptr queue);
|
||||
|
||||
typedef to_t_nc_sycl_t<sycl::half> to_fp16_nc_sycl_t;
|
||||
to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type);
|
||||
|
||||
#endif // GGML_SYCL_CONVERT_HPP
|
||||
|
||||
@@ -193,7 +193,7 @@ static void ggml_check_sycl() try {
|
||||
|
||||
if (!initialized) {
|
||||
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
||||
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
||||
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
|
||||
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
|
||||
GGML_LOG_INFO("Running with Environment Variables:\n");
|
||||
@@ -338,7 +338,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
assert(tensor->view_src->buffer->buft == buffer->buft);
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q4_0) {
|
||||
if (tensor->type == GGML_TYPE_Q4_0 && !g_ggml_sycl_disable_optimize) {
|
||||
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
||||
tensor->extra = extra;
|
||||
ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
|
||||
@@ -2694,35 +2694,31 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
|
||||
const sycl::half *src1_as_f16, char *dst,
|
||||
const void **ptrs_src, void **ptrs_dst,
|
||||
int64_t ne12, int64_t ne13, int64_t ne23,
|
||||
size_t nb02, size_t nb03, size_t nb12,
|
||||
size_t nb13, size_t nbd2, size_t nbd3,
|
||||
int64_t r2, int64_t r3,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16, char * dst,
|
||||
const void ** ptrs_src, void ** ptrs_dst, int64_t ne12, int64_t ne13, int64_t ne23,
|
||||
size_t nb02, size_t nb03, size_t nb12, size_t nb13, size_t nbd2, size_t nbd3,
|
||||
int64_t r2, int64_t r3, const sycl::nd_item<3> & item_ct1) {
|
||||
const int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
|
||||
const int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
|
||||
|
||||
if (i13 >= ne13 || i12 >= ne12) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t i03 = i13 / r3;
|
||||
int64_t i02 = i12 / r2;
|
||||
const int64_t i03 = i13 / r3;
|
||||
const int64_t i02 = i12 / r2;
|
||||
|
||||
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
||||
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
|
||||
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
||||
const uint8_t * src0_bytes = reinterpret_cast<const uint8_t *>(src0_as_f16);
|
||||
const uint8_t * src1_bytes = reinterpret_cast<const uint8_t *>(src1_as_f16);
|
||||
uint8_t * dst_bytes = reinterpret_cast<uint8_t *>(dst);
|
||||
|
||||
ptrs_src[0 * ne23 + i12 + i13 * ne12] = src0_bytes + i02 * nb02 + i03 * nb03;
|
||||
ptrs_src[1 * ne23 + i12 + i13 * ne12] = src1_bytes + i12 * nb12 + i13 * nb13;
|
||||
ptrs_dst[0 * ne23 + i12 + i13 * ne12] = dst_bytes + i12 * nbd2 + i13 * nbd3;
|
||||
}
|
||||
|
||||
static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
|
||||
const ggml_tensor *src0,
|
||||
const ggml_tensor *src1,
|
||||
ggml_tensor *dst) try {
|
||||
static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
|
||||
const ggml_tensor * src1, ggml_tensor * dst) try {
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
|
||||
@@ -2730,102 +2726,100 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
// TODO: see https://github.com/ggml-org/llama.cpp/pull/13155
|
||||
// Batched mul_mat requires a rewrite to support both oneDNN and non-contiguous dst
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
queue_ptr main_stream = ctx.stream();;
|
||||
queue_ptr queue = ctx.stream();
|
||||
|
||||
void * src0_ddq = src0->data;
|
||||
sycl::half *src0_as_f16 = (sycl::half *)src0_ddq;
|
||||
float * src1_ddf = (float *) src1->data;
|
||||
float * dst_ddf = (float *) dst->data;
|
||||
dpct::has_capability_or_fail(queue->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
const sycl::half * src0_f16 = static_cast<const sycl::half *>(src0->data);
|
||||
float * dst_ddf = static_cast<float *>(dst->data);
|
||||
|
||||
const sycl::half * src1_f16 = static_cast<const sycl::half *>(src1->data);
|
||||
const size_t type_size_src1 = ggml_type_size(src1->type);
|
||||
GGML_ASSERT(nb10 == type_size_src1);
|
||||
|
||||
// SRC1 strides
|
||||
int64_t s11 = nb11 / type_size_src1;
|
||||
int64_t s12 = nb12 / type_size_src1;
|
||||
int64_t s13 = nb13 / type_size_src1;
|
||||
ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
|
||||
|
||||
// convert src1 to fp16
|
||||
ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
|
||||
const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
|
||||
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
|
||||
const int64_t ne_src1 = ggml_nelements(src1);
|
||||
src1_f16_alloc.alloc(ne_src1);
|
||||
GGML_ASSERT(to_fp16_sycl != nullptr);
|
||||
to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
|
||||
to_fp16_nc_sycl(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, queue);
|
||||
|
||||
src1_f16 = src1_f16_alloc.get();
|
||||
s11 = ne10;
|
||||
s12 = ne11 * s11;
|
||||
s13 = ne12 * s12;
|
||||
}
|
||||
sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf
|
||||
: src1_f16_alloc.get();
|
||||
|
||||
char * dst_t;
|
||||
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
|
||||
char * dst_t = reinterpret_cast<char *>(dst_ddf);
|
||||
|
||||
dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
|
||||
dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
|
||||
dpct::library_data_t mkl_compute_type = dpct::library_data_t::real_float;
|
||||
dpct::library_data_t mkl_data_type = dpct::library_data_t::real_float;
|
||||
|
||||
// dst strides
|
||||
size_t nbd2 = dst->nb[2];
|
||||
size_t nbd3 = dst->nb[3];
|
||||
|
||||
const float alpha_f32 = 1.0f;
|
||||
const float beta_f32 = 0.0f;
|
||||
const float beta_f32 = 0.0f;
|
||||
|
||||
const void * alpha = &alpha_f32;
|
||||
const void * beta = &beta_f32;
|
||||
|
||||
dst_t = (char *) dst_ddf;
|
||||
|
||||
GGML_ASSERT(ne12 % ne02 == 0);
|
||||
GGML_ASSERT(ne13 % ne03 == 0);
|
||||
|
||||
// broadcast factors
|
||||
const int64_t r2 = ne12/ne02;
|
||||
const int64_t r3 = ne13/ne03;
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
const int64_t r3 = ne13 / ne03;
|
||||
|
||||
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
|
||||
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
||||
*main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
(const char *) src0_as_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
|
||||
(const char *) src1_f16, dpct::library_data_t::real_half, nb11 / nb10, nb12 / nb10, beta, (char *) dst_t,
|
||||
cu_data_type, ne01, nb2 / nb0, ne12 * ne13, cu_compute_type)));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
|
||||
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
|
||||
src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_t,
|
||||
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
|
||||
} else {
|
||||
const int ne23 = ne12*ne13;
|
||||
const int ne23 = ne12 * ne13;
|
||||
|
||||
ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
|
||||
ggml_sycl_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
|
||||
ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2 * ne23);
|
||||
ggml_sycl_pool_alloc<void *> ptrs_dst(ctx.pool(), 1 * ne23);
|
||||
ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
|
||||
|
||||
sycl::range<3> block_dims(1, ne12, ne13);
|
||||
/*
|
||||
DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
{
|
||||
dpct::has_capability_or_fail(main_stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
main_stream->submit([&](sycl::handler &cgh) {
|
||||
const void **ptrs_src_get = ptrs_src.get();
|
||||
void **ptrs_dst_get = ptrs_dst.get();
|
||||
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
|
||||
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_compute_batched_ptrs(
|
||||
src0_as_f16, src1_f16,
|
||||
dst_t, ptrs_src_get,
|
||||
ptrs_dst_get, ne12, ne13, ne23,
|
||||
nb02, nb03, nb12_scaled, nb13_scaled,
|
||||
nbd2, nbd3, r2, r3, item_ct1);
|
||||
});
|
||||
queue->submit([&](sycl::handler & cgh) {
|
||||
const void ** ptrs_src_get = ptrs_src.get();
|
||||
void ** ptrs_dst_get = ptrs_dst.get();
|
||||
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
|
||||
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
k_compute_batched_ptrs(src0_f16, src1_f16, dst_t, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
|
||||
nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
||||
*main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
*queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
|
||||
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
|
||||
(void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));
|
||||
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
|
||||
(void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
|
||||
}
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
} catch (const sycl::exception & exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
|
||||
@@ -2966,7 +2960,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
// The kernel from the if path is faster for that specific case, but does not support all mul mats.
|
||||
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
|
||||
}
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
// KQV single-batch
|
||||
ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||
@@ -3873,9 +3867,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (a->ne[3] != b->ne[3]) {
|
||||
return false;
|
||||
}
|
||||
if (!ggml_is_contiguous(b)) {
|
||||
return false;
|
||||
}
|
||||
ggml_type a_type = a->type;
|
||||
if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS ||
|
||||
a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S ||
|
||||
|
||||
@@ -1632,7 +1632,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
|
||||
const uint32_t warps = warptile[0] / warptile[10];
|
||||
|
||||
const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size;
|
||||
const uint32_t mmid_row_ids = mul_mat_id ? 3072 * sizeof(uint32_t) : 0;
|
||||
const uint32_t mmid_row_ids = mul_mat_id ? 4096 * sizeof(uint32_t) : 0;
|
||||
const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0;
|
||||
|
||||
const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size;
|
||||
@@ -5260,7 +5260,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
|
||||
const uint64_t nei0 = ids->ne[0];
|
||||
const uint64_t nei1 = ids->ne[1];
|
||||
GGML_ASSERT(nei0 * nei1 <= 3072);
|
||||
GGML_ASSERT(nei0 * nei1 <= 4096);
|
||||
|
||||
const uint32_t nbi1 = ids->nb[1];
|
||||
const uint32_t nbi2 = ids->nb[2];
|
||||
|
||||
@@ -103,7 +103,7 @@ shared FLOAT_TYPE buf_a[BM * SHMEM_STRIDE];
|
||||
shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE];
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
shared u16vec2 row_ids[3072];
|
||||
shared u16vec2 row_ids[4096];
|
||||
#endif // MUL_MAT_ID
|
||||
|
||||
#define NUM_WARPS (BLOCK_SIZE / WARP)
|
||||
|
||||
@@ -92,7 +92,7 @@ layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
#ifdef MUL_MAT_ID
|
||||
layout (binding = 3) readonly buffer IDS {int data_ids[];};
|
||||
|
||||
shared u16vec4 row_ids[3072];
|
||||
shared u16vec4 row_ids[4096];
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB {
|
||||
B_TYPE b[];
|
||||
|
||||
@@ -101,7 +101,7 @@ shared FLOAT_TYPE_VEC2 buf_b_ds[BN];
|
||||
#define LOAD_VEC_B 4
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
shared u16vec2 row_ids[3072];
|
||||
shared u16vec2 row_ids[4096];
|
||||
#endif // MUL_MAT_ID
|
||||
|
||||
#define NUM_WARPS (BLOCK_SIZE / WARP)
|
||||
|
||||
+6
-2
@@ -1299,6 +1299,10 @@ bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
|
||||
return ggml_is_contiguous_n(tensor, 2);
|
||||
}
|
||||
|
||||
bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor) {
|
||||
return ggml_nbytes(tensor) == ggml_nelements(tensor) * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
|
||||
}
|
||||
|
||||
bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
@@ -2728,11 +2732,11 @@ void ggml_mul_mat_set_prec(
|
||||
c = ggml_mul_mat_id(ctx, as, b, ids);
|
||||
|
||||
as -> [cols, rows, n_expert]
|
||||
ids -> [n_experts_used, n_tokens] (i32)
|
||||
b -> [cols, n_expert_used, n_tokens]
|
||||
ids -> [n_expert_used, n_tokens] (i32)
|
||||
c -> [rows, n_expert_used, n_tokens]
|
||||
|
||||
in b, n_experts_used can be broadcasted to match the n_expert_used of ids
|
||||
in b, n_expert_used can be broadcasted to match the n_expert_used of ids
|
||||
|
||||
c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
|
||||
*/
|
||||
|
||||
@@ -1,7 +0,0 @@
|
||||
# pyright: reportUnusedImport=false
|
||||
|
||||
from .gguf_convert_endian import main as gguf_convert_endian_entrypoint
|
||||
from .gguf_dump import main as gguf_dump_entrypoint
|
||||
from .gguf_set_metadata import main as gguf_set_metadata_entrypoint
|
||||
from .gguf_new_metadata import main as gguf_new_metadata_entrypoint
|
||||
from .gguf_editor_gui import main as gguf_editor_gui_entrypoint
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.16.2"
|
||||
version = "0.16.3"
|
||||
description = "Read and write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
@@ -36,8 +36,8 @@ requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint"
|
||||
gguf-dump = "gguf.scripts:gguf_dump_entrypoint"
|
||||
gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint"
|
||||
gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint"
|
||||
gguf-editor-gui = "gguf.scripts:gguf_editor_gui_entrypoint"
|
||||
gguf-convert-endian = "gguf.scripts.gguf_convert_endian:main"
|
||||
gguf-dump = "gguf.scripts.gguf_dump:main"
|
||||
gguf-set-metadata = "gguf.scripts.gguf_set_metadata:main"
|
||||
gguf-new-metadata = "gguf.scripts.gguf_new_metadata:main"
|
||||
gguf-editor-gui = "gguf.scripts.gguf_editor_gui:main"
|
||||
|
||||
+13
-10
@@ -351,19 +351,17 @@ extern "C" {
|
||||
enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
|
||||
enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
|
||||
|
||||
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
|
||||
// TODO: move at the end of the struct
|
||||
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||
bool embeddings; // if true, extract embeddings (together with logits)
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
bool no_perf; // whether to measure performance timings
|
||||
|
||||
// Abort callback
|
||||
// if it returns true, execution of llama_decode() will be aborted
|
||||
// currently works only with CPU execution
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
|
||||
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
|
||||
bool embeddings; // if true, extract embeddings (together with logits)
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
bool no_perf; // whether to measure performance timings
|
||||
};
|
||||
|
||||
// model quantization parameters
|
||||
@@ -924,14 +922,19 @@ extern "C" {
|
||||
// Frees a batch of tokens allocated with llama_batch_init()
|
||||
LLAMA_API void llama_batch_free(struct llama_batch batch);
|
||||
|
||||
// Processes a batch of tokens with the ecoder part of the encoder-decoder model.
|
||||
// Stores the encoder output internally for later use by the decoder cross-attention layers.
|
||||
// Process a batch of tokens.
|
||||
// In contrast to llama_decode() - this call does not use KV cache.
|
||||
// For encode-decoder contexts, processes the batch using the encoder.
|
||||
// Can store the encoder output internally for later use by the decoder's cross-attention layers.
|
||||
// 0 - success
|
||||
// < 0 - error. the KV cache state is restored to the state before this call
|
||||
LLAMA_API int32_t llama_encode(
|
||||
struct llama_context * ctx,
|
||||
struct llama_batch batch);
|
||||
|
||||
// Process a batch of tokens.
|
||||
// Requires KV cache.
|
||||
// For encode-decoder contexts, processes the batch using the decoder.
|
||||
// Positive return values does not mean a fatal error, but rather a warning.
|
||||
// 0 - success
|
||||
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
||||
|
||||
@@ -40,5 +40,6 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
llama-convert-hf-to-gguf = "convert_hf_to_gguf:main"
|
||||
llama-convert-lora-to-gguf = "convert_lora_to_gguf:main"
|
||||
llama-convert-llama-ggml-to-gguf = "convert_llama_ggml_to_gguf:main"
|
||||
llama-ggml-vk-generate-shaders = "ggml_vk_generate_shaders:main"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
-r ../tools/llava/requirements.txt
|
||||
-r ../tools/mtmd/requirements.txt
|
||||
-r ../tools/server/bench/requirements.txt
|
||||
-r ../tools/server/tests/requirements.txt
|
||||
|
||||
|
||||
@@ -1 +1 @@
|
||||
0482de9c63b9134eb462c7732888c0ee0dbc2755
|
||||
b59bddafe278877dfa22a80e53a637513862babb
|
||||
|
||||
@@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
|
||||
std::vector<ggml_backend_buffer_type_t> buft_extra;
|
||||
{
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
||||
|
||||
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
|
||||
@@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
|
||||
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
|
||||
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
||||
|
||||
break;
|
||||
|
||||
+8
-6
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
|
||||
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
|
||||
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
|
||||
{ "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
|
||||
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
|
||||
{ "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
|
||||
{ "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
|
||||
@@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|im_start|>assistant\n";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
|
||||
// Official mistral 'v7' template
|
||||
// See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
|
||||
// https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
|
||||
const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
std::string content(message->content);
|
||||
if (role == "system") {
|
||||
ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
|
||||
ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
|
||||
} else if (role == "user") {
|
||||
ss << "[INST] " << content << "[/INST]";
|
||||
}
|
||||
else {
|
||||
ss << " " << content << "</s>";
|
||||
ss << "[INST]" << trailing_space << content << "[/INST]";
|
||||
} else {
|
||||
ss << trailing_space << content << "</s>";
|
||||
}
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
|
||||
|
||||
@@ -14,6 +14,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_MISTRAL_V3,
|
||||
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
|
||||
LLM_CHAT_TEMPLATE_MISTRAL_V7,
|
||||
LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
|
||||
LLM_CHAT_TEMPLATE_PHI_3,
|
||||
LLM_CHAT_TEMPLATE_PHI_4,
|
||||
LLM_CHAT_TEMPLATE_FALCON_3,
|
||||
|
||||
+25
-14
@@ -116,8 +116,6 @@ llama_context::llama_context(
|
||||
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
||||
}
|
||||
|
||||
logits_all = params.logits_all;
|
||||
|
||||
if (!hparams.vocab_only) {
|
||||
// GPU backends
|
||||
for (auto * dev : model.devices) {
|
||||
@@ -253,7 +251,7 @@ llama_context::llama_context(
|
||||
}
|
||||
|
||||
// reserve worst-case graph
|
||||
if (!hparams.vocab_only) {
|
||||
if (!hparams.vocab_only && memory) {
|
||||
const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
@@ -702,6 +700,8 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
t_compute_start_us = ggml_time_us();
|
||||
}
|
||||
|
||||
embd_seq.clear();
|
||||
|
||||
n_queued_tokens += n_tokens;
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
@@ -763,12 +763,12 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
|
||||
GGML_ASSERT(backend_embd != nullptr);
|
||||
|
||||
GGML_ASSERT(embd != nullptr);
|
||||
|
||||
switch (cparams.pooling_type) {
|
||||
case LLAMA_POOLING_TYPE_NONE:
|
||||
{
|
||||
// extract token embeddings
|
||||
GGML_ASSERT(embd != nullptr);
|
||||
|
||||
GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
|
||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
|
||||
} break;
|
||||
@@ -793,11 +793,18 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
} break;
|
||||
case LLAMA_POOLING_TYPE_RANK:
|
||||
{
|
||||
// TODO: this likely should be the same logic as in llama_decoder_internal, but better to
|
||||
// wait for an encoder model that requires this pooling type in order to test it
|
||||
// https://github.com/ggerganov/llama.cpp/pull/9510
|
||||
GGML_ABORT("RANK pooling not implemented yet");
|
||||
}
|
||||
// extract the rerank score - a single float per sequence
|
||||
auto & embd_seq_out = embd_seq;
|
||||
|
||||
for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch.seq_id[s][0];
|
||||
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
|
||||
continue;
|
||||
}
|
||||
embd_seq_out[seq_id].resize(1);
|
||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
|
||||
}
|
||||
} break;
|
||||
case LLAMA_POOLING_TYPE_UNSPECIFIED:
|
||||
{
|
||||
GGML_ABORT("unknown pooling type");
|
||||
@@ -835,6 +842,11 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
}
|
||||
|
||||
int llama_context::decode(llama_batch & inp_batch) {
|
||||
if (!memory) {
|
||||
LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__);
|
||||
return encode(inp_batch);
|
||||
}
|
||||
|
||||
if (inp_batch.n_tokens == 0) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
|
||||
return -1;
|
||||
@@ -890,7 +902,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||
n_outputs_all += batch.logits[i] != 0;
|
||||
}
|
||||
} else if (logits_all || embd_pooled) {
|
||||
} else if (embd_pooled) {
|
||||
n_outputs_all = n_tokens_all;
|
||||
} else {
|
||||
// keep last output only
|
||||
@@ -1853,13 +1865,12 @@ llama_context_params llama_context_default_params() {
|
||||
/*.cb_eval_user_data =*/ nullptr,
|
||||
/*.type_k =*/ GGML_TYPE_F16,
|
||||
/*.type_v =*/ GGML_TYPE_F16,
|
||||
/*.logits_all =*/ false,
|
||||
/*.abort_callback =*/ nullptr,
|
||||
/*.abort_callback_data =*/ nullptr,
|
||||
/*.embeddings =*/ false,
|
||||
/*.offload_kqv =*/ true,
|
||||
/*.flash_attn =*/ false,
|
||||
/*.no_perf =*/ true,
|
||||
/*.abort_callback =*/ nullptr,
|
||||
/*.abort_callback_data =*/ nullptr,
|
||||
};
|
||||
|
||||
return result;
|
||||
|
||||
@@ -187,9 +187,6 @@ private:
|
||||
|
||||
std::unique_ptr<llama_memory_i> memory;
|
||||
|
||||
// TODO: remove
|
||||
bool logits_all = false;
|
||||
|
||||
// decode output (2-dimensional array: [n_outputs][n_vocab])
|
||||
size_t logits_size = 0; // capacity (of floats) for logits
|
||||
float * logits = nullptr;
|
||||
|
||||
+12
-1
@@ -782,7 +782,7 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||
} break;
|
||||
}
|
||||
|
||||
if (type_gate == LLM_FFN_PAR) {
|
||||
if (gate && type_gate == LLM_FFN_PAR) {
|
||||
cur = ggml_mul(ctx0, cur, tmp);
|
||||
cb(cur, "ffn_gate_par", il);
|
||||
}
|
||||
@@ -1227,8 +1227,19 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
||||
|
||||
if (v_mla) {
|
||||
#if 0
|
||||
// v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
|
||||
// However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
|
||||
cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
|
||||
cur = ggml_mul_mat(ctx0, v_mla, cur);
|
||||
#else
|
||||
// It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
|
||||
// The permutations are noops and only change how the tensor data is interpreted.
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
cur = ggml_mul_mat(ctx0, v_mla, cur);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
|
||||
#endif
|
||||
}
|
||||
|
||||
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
|
||||
|
||||
@@ -823,6 +823,10 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
|
||||
mmaps_used.reserve(files.size());
|
||||
for (const auto & file : files) {
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
|
||||
if (!reg) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
|
||||
auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
|
||||
std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
|
||||
mmaps_used.emplace_back(mapping->size(), 0);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user