mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-04 19:45:57 +02:00
Compare commits
81 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 12568ca8c8 | |||
| c807c6e3b0 | |||
| 0949beb5a3 | |||
| 9012c50fc8 | |||
| 0dd7f915fd | |||
| 550d684bd1 | |||
| 8635e221c8 | |||
| 930e0210d1 | |||
| 96c1db26c4 | |||
| 4ead6fd957 | |||
| 5eaee65384 | |||
| 60b68a6279 | |||
| b76429a69c | |||
| 86db42e97f | |||
| 6217b49583 | |||
| 0d0764dfd2 | |||
| 6da7168312 | |||
| 8bccdbbff9 | |||
| bcb5eeb645 | |||
| 225088ea76 | |||
| 82d3f4d3b2 | |||
| 17f6245168 | |||
| 7bfe60fdf9 | |||
| 750579ff14 | |||
| 134d6e54d4 | |||
| ca7f7b7b94 | |||
| 0dedb9ef7a | |||
| 2799d933b5 | |||
| 04fe84b69d | |||
| 5a4cd6741f | |||
| 2248799a58 | |||
| 72d693e4fb | |||
| 98d2d2884e | |||
| 84652b80cf | |||
| 52f1096f21 | |||
| 606fa42f5d | |||
| 7fc1c4ef78 | |||
| 82209efb7e | |||
| 9998d88bc8 | |||
| cd03ec7642 | |||
| 4889afba5f | |||
| 041fe83d74 | |||
| cfe9838d26 | |||
| ff6b1062af | |||
| 97895129e5 | |||
| 86f8daacfe | |||
| cf8b0dbda9 | |||
| fd6ae4ca1c | |||
| fb19f94c71 | |||
| 7f251fdbce | |||
| a6cc43c286 | |||
| a678916623 | |||
| 81df3f7cfa | |||
| de71b5f81c | |||
| 788fcbc5dd | |||
| 9d49acb2a7 | |||
| e365e658f0 | |||
| 4eac5b4509 | |||
| d5b780a676 | |||
| 471540ae8a | |||
| 19124078be | |||
| bcdcc1044f | |||
| 037bfe38d0 | |||
| 8685e7b075 | |||
| 09b4efa95f | |||
| 455d8e4be8 | |||
| 91fef95362 | |||
| 9e5647affa | |||
| 4f02d47339 | |||
| 23b8cc4991 | |||
| 59accc8863 | |||
| 83d58e02fc | |||
| 89a5474f0e | |||
| fd1c0ec3f0 | |||
| 45cac7ca70 | |||
| b94050e896 | |||
| a279d0f0f4 | |||
| 268d61e178 | |||
| 6990e2f1f7 | |||
| fcc7508759 | |||
| 5e6c0e18b6 |
@@ -1,4 +1,4 @@
|
||||
ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
|
||||
ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
|
||||
|
||||
## Build Image
|
||||
|
||||
|
||||
@@ -2,7 +2,19 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
|
||||
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
# Optional proxy build arguments - empty by default
|
||||
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
|
||||
ARG IGC_VERSION=v2.30.1
|
||||
ARG IGC_VERSION_FULL=2_2.30.1+20950
|
||||
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
|
||||
ARG IGDGMM_VERSION=22.9.0
|
||||
|
||||
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
|
||||
ARG NPU_DRIVER_VERSION=v1.32.0
|
||||
ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
|
||||
ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
|
||||
|
||||
# Optional proxy build arguments
|
||||
ARG http_proxy=
|
||||
ARG https_proxy=
|
||||
|
||||
@@ -78,13 +90,47 @@ ARG http_proxy
|
||||
ARG https_proxy
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 libtbb12 curl \
|
||||
&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
# Install GPU drivers
|
||||
ARG IGC_VERSION
|
||||
ARG IGC_VERSION_FULL
|
||||
ARG COMPUTE_RUNTIME_VERSION
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL
|
||||
ARG IGDGMM_VERSION
|
||||
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
|
||||
&& dpkg --install *.deb \
|
||||
&& rm -rf /tmp/neo/
|
||||
|
||||
# Install NPU drivers
|
||||
ARG NPU_DRIVER_VERSION
|
||||
ARG NPU_DRIVER_FULL
|
||||
ARG LIBZE1_VERSION
|
||||
RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
|
||||
&& wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
|
||||
&& tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
|
||||
&& dpkg --install *.deb \
|
||||
&& rm -rf /tmp/npu/
|
||||
|
||||
RUN cd /tmp \
|
||||
&& wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
|
||||
&& dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
|
||||
&& rm libze1_${LIBZE1_VERSION}_amd64.deb
|
||||
|
||||
COPY --from=build /app/lib/ /app/
|
||||
|
||||
### Full (all binaries)
|
||||
|
||||
@@ -51,7 +51,7 @@ jobs:
|
||||
distribution: zulu
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3
|
||||
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
|
||||
@@ -246,6 +246,7 @@ jobs:
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
glslc \
|
||||
spirv-headers \
|
||||
gcc-14-loongarch64-linux-gnu \
|
||||
g++-14-loongarch64-linux-gnu \
|
||||
libvulkan-dev:loong64
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
name: CI (openvino)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-openvino.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-openvino.yml',
|
||||
'ggml/src/ggml-openvino/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-openvino:
|
||||
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
|
||||
|
||||
concurrency:
|
||||
group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
|
||||
cancel-in-progress: false
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- variant: cpu
|
||||
runner: '"ubuntu-24.04"'
|
||||
openvino_device: "CPU"
|
||||
- variant: gpu
|
||||
runner: '["self-hosted","Linux","Intel","OpenVINO"]'
|
||||
openvino_device: "GPU"
|
||||
|
||||
runs-on: ${{ fromJSON(matrix.runner) }}
|
||||
|
||||
env:
|
||||
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
if: runner.environment == 'github-hosted'
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
|
||||
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
|
||||
|
||||
- name: Use OpenVINO Toolkit Cache
|
||||
if: runner.environment == 'github-hosted'
|
||||
uses: actions/cache@v5
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
|
||||
version_full: ${{ env.OPENVINO_VERSION_FULL }}
|
||||
|
||||
- name: Install OpenVINO dependencies
|
||||
run: |
|
||||
cd ./openvino_toolkit
|
||||
chmod +x ./install_dependencies/install_openvino_dependencies.sh
|
||||
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENVINO=ON
|
||||
time cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# TODO: fix and re-enable the `test-llama-archs` test below
|
||||
run: |
|
||||
cd ${{ github.workspace }}
|
||||
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
|
||||
export GGML_OPENVINO_DEVICE=GPU
|
||||
fi
|
||||
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
|
||||
@@ -97,6 +97,36 @@ jobs:
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: investigate slight precision issues in some operations for test-backend-ops on the WebGPU backend.
|
||||
#ggml-ci-nvidia-webgpu:
|
||||
# runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
|
||||
# - name: Dawn Dependency
|
||||
# id: dawn-depends
|
||||
# run: |
|
||||
# DAWN_VERSION="v20260317.182325"
|
||||
# DAWN_OWNER="google"
|
||||
# DAWN_REPO="dawn"
|
||||
# DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
|
||||
# echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
# curl -L -o artifact.tar.gz \
|
||||
# "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
# mkdir dawn
|
||||
# tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_WEBGPU=1 \
|
||||
# GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
# GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
|
||||
# bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMX-compatible machine
|
||||
#ggml-ci-cpu-amx:
|
||||
# runs-on: [self-hosted, Linux, CPU, AMX]
|
||||
@@ -235,6 +265,10 @@ jobs:
|
||||
ggml-ci-intel-openvino-gpu-low-perf:
|
||||
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||
|
||||
concurrency:
|
||||
group: openvino-gpu-${{ github.head_ref || github.ref }}
|
||||
cancel-in-progress: false
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
|
||||
@@ -0,0 +1,142 @@
|
||||
name: CI (sycl)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-sycl.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-sycl.yml',
|
||||
'ggml/src/ggml-sycl/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
ubuntu-24-sycl:
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
windows-latest-sycl:
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
+50
-213
@@ -267,6 +267,56 @@ jobs:
|
||||
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
|
||||
./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
android-arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
NDK_VERSION: "29.0.14206865"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: android-arm64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: temurin
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Install NDK
|
||||
run: |
|
||||
sdkmanager "ndk;${{ env.NDK_VERSION }}"
|
||||
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=android-28 \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-latest-rpc:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -505,186 +555,6 @@ jobs:
|
||||
-DGGML_MUSA=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-sycl:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
|
||||
|
||||
- name: install oneAPI dpcpp compiler
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
|
||||
|
||||
- name: install oneAPI MKL library
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt install intel-oneapi-mkl-devel
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-22-sycl
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-sycl-fp16:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
|
||||
|
||||
- name: install oneAPI dpcpp compiler
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
|
||||
|
||||
- name: install oneAPI MKL library
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt install intel-oneapi-mkl-devel
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-22-sycl-fp16
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DGGML_SYCL_F16=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-24-openvino:
|
||||
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- variant: cpu
|
||||
runner: '"ubuntu-24.04"'
|
||||
openvino_device: "CPU"
|
||||
- variant: gpu
|
||||
runner: '["self-hosted","Linux","X64","Intel"]'
|
||||
openvino_device: "GPU"
|
||||
|
||||
runs-on: ${{ fromJSON(matrix.runner) }}
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
if: runner.environment == 'github-hosted'
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
|
||||
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
|
||||
|
||||
- name: Use OpenVINO Toolkit Cache
|
||||
if: runner.environment == 'github-hosted'
|
||||
uses: actions/cache@v5
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
|
||||
version_full: ${{ env.OPENVINO_VERSION_FULL }}
|
||||
|
||||
- name: Install OpenVINO dependencies
|
||||
run: |
|
||||
cd ./openvino_toolkit
|
||||
chmod +x ./install_dependencies/install_openvino_dependencies.sh
|
||||
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENVINO=ON
|
||||
time cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# TODO: fix and re-enable the `test-llama-archs` test below
|
||||
run: |
|
||||
cd ${{ github.workspace }}
|
||||
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
|
||||
export GGML_OPENVINO_DEVICE=GPU
|
||||
fi
|
||||
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
|
||||
|
||||
windows-latest:
|
||||
runs-on: windows-2025
|
||||
@@ -893,39 +763,6 @@ jobs:
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
windows-latest-sycl:
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Install
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
windows-latest-hip:
|
||||
runs-on: windows-2022
|
||||
|
||||
@@ -236,6 +236,75 @@ jobs:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
|
||||
android-arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
NDK_VERSION: "29.0.14206865"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: android-arm64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: temurin
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Install NDK
|
||||
run: |
|
||||
sdkmanager "ndk;${{ env.NDK_VERSION }}"
|
||||
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=android-28 \
|
||||
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz
|
||||
name: llama-bin-android-arm64.tar.gz
|
||||
|
||||
ubuntu-24-openvino:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
@@ -529,15 +598,29 @@ jobs:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
@@ -545,10 +628,6 @@ jobs:
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
@@ -601,6 +680,82 @@ jobs:
|
||||
path: llama-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
ubuntu-24-sycl:
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
|
||||
ubuntu-22-rocm:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
@@ -618,6 +773,11 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Free up disk space
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
@@ -971,6 +1131,8 @@ jobs:
|
||||
- ubuntu-cpu
|
||||
- ubuntu-vulkan
|
||||
- ubuntu-24-openvino
|
||||
- ubuntu-24-sycl
|
||||
- android-arm64
|
||||
- macOS-cpu
|
||||
- ios-xcode-build
|
||||
- openEuler-cann
|
||||
@@ -1058,6 +1220,11 @@ jobs:
|
||||
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
|
||||
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
|
||||
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
|
||||
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
|
||||
- [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
|
||||
|
||||
**Android:**
|
||||
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
|
||||
|
||||
**Windows:**
|
||||
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
|
||||
|
||||
@@ -145,3 +145,5 @@ poetry.toml
|
||||
/.windsurf/
|
||||
# emscripten
|
||||
a.out.*
|
||||
|
||||
AGENTS.local.md
|
||||
|
||||
+5
-1
@@ -225,7 +225,7 @@ foreach(FILE_PATH ${EXTRA_LICENSES})
|
||||
endforeach()
|
||||
|
||||
if (LLAMA_BUILD_COMMON)
|
||||
license_generate(common)
|
||||
license_generate(llama-common)
|
||||
endif()
|
||||
|
||||
#
|
||||
@@ -249,6 +249,10 @@ set_target_properties(llama
|
||||
|
||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
||||
|
||||
if (LLAMA_BUILD_COMMON)
|
||||
install(TARGETS llama-common LIBRARY)
|
||||
endif()
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
/ci/ @ggerganov
|
||||
/cmake/ @ggerganov
|
||||
/common/ @ggml-org/llama-common
|
||||
/common/fit.* @JohannesGaessler
|
||||
/common/jinja/ @CISC
|
||||
/common/ngram-map.* @srogmann
|
||||
/convert_*.py @CISC
|
||||
|
||||
+29
-10
@@ -1,9 +1,11 @@
|
||||
# common
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
llama_add_compile_flags()
|
||||
|
||||
#
|
||||
# llama-common-base
|
||||
#
|
||||
|
||||
# Build info header
|
||||
|
||||
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
|
||||
@@ -33,17 +35,25 @@ endif()
|
||||
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
|
||||
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
|
||||
set(TARGET build_info)
|
||||
add_library(${TARGET} OBJECT ${OUTPUT_FILE})
|
||||
set(TARGET llama-common-base)
|
||||
add_library(${TARGET} STATIC ${OUTPUT_FILE})
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
|
||||
set(TARGET common)
|
||||
#
|
||||
# llama-common
|
||||
#
|
||||
|
||||
add_library(${TARGET} STATIC
|
||||
set(TARGET llama-common)
|
||||
|
||||
add_library(${TARGET}
|
||||
arg.cpp
|
||||
arg.h
|
||||
base64.hpp
|
||||
@@ -63,6 +73,8 @@ add_library(${TARGET} STATIC
|
||||
debug.h
|
||||
download.cpp
|
||||
download.h
|
||||
fit.cpp
|
||||
fit.h
|
||||
hf-cache.cpp
|
||||
hf-cache.h
|
||||
http.h
|
||||
@@ -106,17 +118,24 @@ add_library(${TARGET} STATIC
|
||||
jinja/caps.h
|
||||
)
|
||||
|
||||
set_target_properties(${TARGET} PROPERTIES
|
||||
VERSION ${LLAMA_INSTALL_VERSION}
|
||||
SOVERSION 0
|
||||
MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
|
||||
)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC . ../vendor)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
# TODO: make fine-grained exports in the future
|
||||
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
endif()
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
build_info
|
||||
cpp-httplib
|
||||
)
|
||||
target_link_libraries(${TARGET} PUBLIC llama-common-base)
|
||||
target_link_libraries(${TARGET} PRIVATE cpp-httplib)
|
||||
|
||||
if (LLAMA_LLGUIDANCE)
|
||||
include(ExternalProject)
|
||||
|
||||
+35
-9
@@ -1,5 +1,6 @@
|
||||
#include "arg.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "chat.h"
|
||||
#include "common.h"
|
||||
#include "download.h"
|
||||
@@ -291,7 +292,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
|
||||
hf_tag = "default";
|
||||
}
|
||||
|
||||
std::string model_endpoint = get_model_endpoint();
|
||||
std::string model_endpoint = common_get_model_endpoint();
|
||||
auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
|
||||
|
||||
// prepare local path for caching
|
||||
@@ -1044,8 +1045,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--version"},
|
||||
"show version and build info",
|
||||
[](common_params &) {
|
||||
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
||||
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||
fprintf(stderr, "version: %d (%s)\n", llama_build_number(), llama_commit());
|
||||
fprintf(stderr, "built with %s for %s\n", llama_compiler(), llama_build_target());
|
||||
exit(0);
|
||||
}
|
||||
));
|
||||
@@ -1315,13 +1316,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
|
||||
add_opt(common_arg(
|
||||
{"--clear-idle"},
|
||||
{"--no-clear-idle"},
|
||||
{"--cache-idle-slots"},
|
||||
{"--no-cache-idle-slots"},
|
||||
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
|
||||
[](common_params & params, bool value) {
|
||||
params.clear_idle = value;
|
||||
params.cache_idle_slots = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--context-shift"},
|
||||
{"--no-context-shift"},
|
||||
@@ -2425,6 +2426,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_ARG_FIT"));
|
||||
add_opt(common_arg(
|
||||
{ "-fitp", "--fit-print" }, "[on|off]",
|
||||
string_format("print the estimated required memory ('on' or 'off', default: '%s')", params.fit_params_print ? "on" : "off"),
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (is_truthy(value)) {
|
||||
params.fit_params_print = true;
|
||||
} else if (is_falsey(value)) {
|
||||
params.fit_params_print = false;
|
||||
} else {
|
||||
throw std::runtime_error(
|
||||
string_format("error: unknown value for --fit-print: '%s'\n", value.c_str()));
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_FIT_PARAMS}).set_env("LLAMA_ARG_FIT_ESTIMATE"));
|
||||
add_opt(common_arg(
|
||||
{ "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
|
||||
string_format("target margin per device for --fit, comma-separated list of values, "
|
||||
@@ -3107,14 +3122,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
|
||||
[](common_params & params, int value) {
|
||||
if (value < -1) { throw std::invalid_argument("invalid value"); }
|
||||
params.reasoning_budget = value;
|
||||
params.sampling.reasoning_budget_tokens = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
|
||||
add_opt(common_arg(
|
||||
{"--reasoning-budget-message"}, "MESSAGE",
|
||||
"message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.reasoning_budget_message = value;
|
||||
params.sampling.reasoning_budget_message = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
|
||||
add_opt(common_arg(
|
||||
@@ -3887,6 +3902,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
|
||||
add_opt(common_arg(
|
||||
{"--spec-default"},
|
||||
string_format("enable default speculative decoding config"),
|
||||
[](common_params & params) {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
|
||||
params.speculative.ngram_size_n = 24;
|
||||
params.speculative.n_min = 48;
|
||||
params.speculative.n_max = 64;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
|
||||
return ctx_arg;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,35 @@
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
|
||||
int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
|
||||
char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
|
||||
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
|
||||
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
|
||||
char const * LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
|
||||
char const * LLAMA_COMPILER = "@BUILD_COMPILER@";
|
||||
char const * LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
|
||||
|
||||
int llama_build_number(void) {
|
||||
return LLAMA_BUILD_NUMBER;
|
||||
}
|
||||
|
||||
const char * llama_commit(void) {
|
||||
return LLAMA_COMMIT;
|
||||
}
|
||||
|
||||
const char * llama_compiler(void) {
|
||||
return LLAMA_COMPILER;
|
||||
}
|
||||
|
||||
const char * llama_build_target(void) {
|
||||
return LLAMA_BUILD_TARGET;
|
||||
}
|
||||
|
||||
const char * llama_build_info(void) {
|
||||
static std::string s = "b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT;
|
||||
return s.c_str();
|
||||
}
|
||||
|
||||
void llama_print_build_info(void) {
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, llama_build_number(), llama_commit());
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, llama_compiler(), llama_build_target());
|
||||
}
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
int llama_build_number(void);
|
||||
|
||||
const char * llama_commit(void);
|
||||
const char * llama_compiler(void);
|
||||
|
||||
const char * llama_build_target(void);
|
||||
const char * llama_build_info(void);
|
||||
|
||||
void llama_print_build_info(void);
|
||||
@@ -443,14 +443,14 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
||||
if (!format.per_call_start.empty()) {
|
||||
auto wrapped_call = format.per_call_start + p.space() + tool_choice + p.space() + format.per_call_end;
|
||||
if (inputs.parallel_tool_calls) {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call) + p.space());
|
||||
} else {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call);
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.space());
|
||||
}
|
||||
if (!format.section_start.empty()) {
|
||||
tool_calls = p.trigger_rule("tool-calls",
|
||||
p.literal(format.section_start) + p.space() + tool_calls + p.space() +
|
||||
(format.section_end.empty() ? p.end() : p.literal(format.section_end)));
|
||||
(format.section_end.empty() ? p.end() : p.literal(format.section_end) + p.space()));
|
||||
}
|
||||
} else {
|
||||
std::string separator = ", "; // Default
|
||||
|
||||
+41
-56
@@ -397,6 +397,25 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
|
||||
return render_message_to_json(msgs, c);
|
||||
}
|
||||
|
||||
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
|
||||
if (tools.empty()) {
|
||||
return json();
|
||||
}
|
||||
|
||||
auto result = json::array();
|
||||
for (const auto & tool : tools) {
|
||||
result.push_back({
|
||||
{ "type", "function" },
|
||||
{ "function", {
|
||||
{ "name", tool.name },
|
||||
{ "description", tool.description },
|
||||
{ "parameters", json::parse(tool.parameters) },
|
||||
}},
|
||||
});
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
|
||||
std::vector<common_chat_tool> result;
|
||||
|
||||
@@ -432,56 +451,6 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
|
||||
return result;
|
||||
}
|
||||
|
||||
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
|
||||
if (tools.empty()) {
|
||||
return json();
|
||||
}
|
||||
|
||||
auto result = json::array();
|
||||
for (const auto & tool : tools) {
|
||||
result.push_back({
|
||||
{ "type", "function" },
|
||||
{ "function",
|
||||
{
|
||||
{ "name", tool.name },
|
||||
{ "description", tool.description },
|
||||
{ "parameters", json::parse(tool.parameters) },
|
||||
} },
|
||||
});
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
|
||||
json delta = json::object();
|
||||
if (!diff.reasoning_content_delta.empty()) {
|
||||
delta["reasoning_content"] = diff.reasoning_content_delta;
|
||||
}
|
||||
if (!diff.content_delta.empty()) {
|
||||
delta["content"] = diff.content_delta;
|
||||
}
|
||||
if (diff.tool_call_index != std::string::npos) {
|
||||
json tool_call;
|
||||
tool_call["index"] = diff.tool_call_index;
|
||||
if (!diff.tool_call_delta.id.empty()) {
|
||||
tool_call["id"] = diff.tool_call_delta.id;
|
||||
tool_call["type"] = "function";
|
||||
}
|
||||
if (!diff.tool_call_delta.name.empty() || !diff.tool_call_delta.arguments.empty()) {
|
||||
json function = json::object();
|
||||
if (!diff.tool_call_delta.name.empty()) {
|
||||
function["name"] = diff.tool_call_delta.name;
|
||||
}
|
||||
if (!diff.tool_call_delta.arguments.empty()) {
|
||||
function["arguments"] = diff.tool_call_delta.arguments;
|
||||
}
|
||||
tool_call["function"] = function;
|
||||
}
|
||||
delta["tool_calls"] = json::array({ tool_call });
|
||||
}
|
||||
return delta;
|
||||
}
|
||||
|
||||
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
|
||||
if (use_jinja) {
|
||||
try {
|
||||
@@ -575,6 +544,26 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
|
||||
return tmpls->has_explicit_template;
|
||||
}
|
||||
|
||||
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
|
||||
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
|
||||
static bool is_lfm2_template(const std::string & src) {
|
||||
return src.find("<|tool_list_start|>") != std::string::npos &&
|
||||
src.find("<|tool_list_end|>") != std::string::npos;
|
||||
}
|
||||
|
||||
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) {
|
||||
common_chat_prompt_preset asr_preset;
|
||||
asr_preset.system = "";
|
||||
asr_preset.user = "Transcribe audio to text";
|
||||
|
||||
if (chat_templates && chat_templates->template_default && is_lfm2_template(chat_templates->template_default->source())) {
|
||||
asr_preset.system = "Perform ASR.";
|
||||
asr_preset.user = "";
|
||||
}
|
||||
|
||||
return asr_preset;
|
||||
}
|
||||
|
||||
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
|
||||
if (!variant.empty()) {
|
||||
if (variant == "tool_use") {
|
||||
@@ -2084,10 +2073,7 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
return common_chat_params_init_kimi_k2(tmpl, params);
|
||||
}
|
||||
|
||||
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
|
||||
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
|
||||
if (src.find("<|tool_list_start|>") != std::string::npos &&
|
||||
src.find("<|tool_list_end|>") != std::string::npos) {
|
||||
if (is_lfm2_template(src)) {
|
||||
LOG_DBG("Using specialized template: LFM2\n");
|
||||
return common_chat_params_init_lfm2(tmpl, params);
|
||||
}
|
||||
@@ -2334,7 +2320,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
|
||||
? input
|
||||
: params.generation_prompt + input;
|
||||
|
||||
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
|
||||
//LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
|
||||
|
||||
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
|
||||
if (params.debug) {
|
||||
@@ -2396,4 +2382,3 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
|
||||
GGML_ASSERT(chat_templates->template_default != nullptr);
|
||||
return chat_templates->template_default->caps.to_map();
|
||||
}
|
||||
|
||||
|
||||
+10
-3
@@ -256,14 +256,13 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
|
||||
// Parses a JSON array of messages in OpenAI's chat completion API format.
|
||||
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
|
||||
|
||||
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
|
||||
|
||||
// DEPRECATED: only used in tests
|
||||
nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
|
||||
|
||||
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
|
||||
nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
|
||||
|
||||
nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
|
||||
|
||||
// get template caps, useful for reporting to server /props endpoint
|
||||
std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);
|
||||
|
||||
@@ -275,3 +274,11 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
const common_chat_template & tmpl,
|
||||
const std::string & src,
|
||||
autoparser::generation_params & params);
|
||||
|
||||
// specialized per-task preset
|
||||
struct common_chat_prompt_preset {
|
||||
std::string system;
|
||||
std::string user;
|
||||
};
|
||||
|
||||
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
|
||||
|
||||
+41
-3
@@ -1,7 +1,9 @@
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "fit.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "sampling.h"
|
||||
@@ -372,7 +374,7 @@ void common_init() {
|
||||
const char * build_type = " (debug)";
|
||||
#endif
|
||||
|
||||
LOG_DBG("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
|
||||
LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
|
||||
}
|
||||
|
||||
std::string common_params_get_system_info(const common_params & params) {
|
||||
@@ -1146,7 +1148,7 @@ common_init_result::common_init_result(common_params & params) :
|
||||
|
||||
if (params.fit_params) {
|
||||
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
|
||||
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||
common_fit_params(params.model.path.c_str(), &mparams, &cparams,
|
||||
params.tensor_split,
|
||||
params.tensor_buft_overrides.data(),
|
||||
params.fit_params_target.data(),
|
||||
@@ -1381,7 +1383,7 @@ common_init_result_ptr common_init_from_params(common_params & params) {
|
||||
|
||||
common_init_result::~common_init_result() = default;
|
||||
|
||||
std::string get_model_endpoint() {
|
||||
std::string common_get_model_endpoint() {
|
||||
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
|
||||
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
|
||||
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
|
||||
@@ -1396,6 +1398,42 @@ std::string get_model_endpoint() {
|
||||
return model_endpoint;
|
||||
}
|
||||
|
||||
common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
|
||||
auto * mem = llama_get_memory(ctx);
|
||||
if (mem == nullptr) {
|
||||
return COMMON_CONTEXT_SEQ_RM_TYPE_NO;
|
||||
}
|
||||
|
||||
common_context_seq_rm_type res = COMMON_CONTEXT_SEQ_RM_TYPE_PART;
|
||||
|
||||
llama_memory_clear(mem, true);
|
||||
|
||||
// eval 2 tokens to check if the context is compatible
|
||||
std::vector<llama_token> tmp;
|
||||
tmp.push_back(0);
|
||||
tmp.push_back(0);
|
||||
|
||||
int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||
if (ret != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
|
||||
res = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
|
||||
goto done;
|
||||
}
|
||||
|
||||
// try to remove the last tokens
|
||||
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
|
||||
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
|
||||
res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
|
||||
goto done;
|
||||
}
|
||||
|
||||
done:
|
||||
llama_memory_clear(mem, true);
|
||||
llama_synchronize(ctx);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
|
||||
std::vector<llama_adapter_lora *> loras;
|
||||
std::vector<float> scales;
|
||||
|
||||
+36
-28
@@ -2,15 +2,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "ggml.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
@@ -27,11 +27,6 @@
|
||||
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
struct common_time_meas {
|
||||
common_time_meas(int64_t & t_acc, bool disable = false);
|
||||
~common_time_meas();
|
||||
@@ -53,14 +48,6 @@ struct common_adapter_lora_info {
|
||||
|
||||
using llama_tokens = std::vector<llama_token>;
|
||||
|
||||
// build info
|
||||
extern int LLAMA_BUILD_NUMBER;
|
||||
extern const char * LLAMA_COMMIT;
|
||||
extern const char * LLAMA_COMPILER;
|
||||
extern const char * LLAMA_BUILD_TARGET;
|
||||
|
||||
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
||||
|
||||
struct common_control_vector_load_info;
|
||||
|
||||
//
|
||||
@@ -287,6 +274,7 @@ struct common_params_sampling {
|
||||
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
|
||||
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
|
||||
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
|
||||
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
|
||||
|
||||
bool backend_sampling = false;
|
||||
|
||||
@@ -315,15 +303,15 @@ struct common_params_speculative {
|
||||
// general-purpose speculative decoding parameters
|
||||
|
||||
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
||||
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
||||
|
||||
// ngram-based speculative decoding
|
||||
|
||||
uint16_t ngram_size_n = 12; // ngram size for lookup
|
||||
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
|
||||
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
uint16_t ngram_size_n = 12; // ngram size for lookup
|
||||
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
|
||||
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
|
||||
std::shared_ptr<common_ngram_mod> ngram_mod;
|
||||
|
||||
@@ -433,11 +421,12 @@ struct common_params {
|
||||
// offload params
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||
bool fit_params_print = false; // print the estimated required memory to run the model
|
||||
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
||||
|
||||
// margin per device in bytes for fitting parameters to free memory:
|
||||
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
|
||||
@@ -579,7 +568,7 @@ struct common_params {
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
bool cache_prompt = true; // whether to enable prompt caching
|
||||
bool clear_idle = true; // save and clear idle slots upon starting a new task
|
||||
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
|
||||
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
|
||||
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
|
||||
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
|
||||
@@ -593,8 +582,6 @@ struct common_params {
|
||||
bool force_pure_content_parser = false;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
|
||||
int reasoning_budget = -1;
|
||||
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
|
||||
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
|
||||
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
|
||||
|
||||
@@ -759,6 +746,11 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
|
||||
str.compare(0, prefix.size(), prefix) == 0;
|
||||
}
|
||||
|
||||
// remove when moving to c++20
|
||||
inline bool string_starts_with(std::string_view str, char prefix) {
|
||||
return !str.empty() && str.front() == prefix;
|
||||
}
|
||||
|
||||
// remove when moving to c++20
|
||||
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
|
||||
return str.size() >= suffix.size() &&
|
||||
@@ -859,7 +851,23 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
|
||||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
|
||||
|
||||
std::string get_model_endpoint();
|
||||
// model endpoint from env
|
||||
std::string common_get_model_endpoint();
|
||||
|
||||
//
|
||||
// Context utils
|
||||
//
|
||||
|
||||
enum common_context_seq_rm_type {
|
||||
COMMON_CONTEXT_SEQ_RM_TYPE_NO = 0, // seq_rm not supported (e.g. no memory module)
|
||||
COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences
|
||||
COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only
|
||||
};
|
||||
|
||||
// check if the llama_context can remove sequences
|
||||
// note: clears the memory of the context
|
||||
common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx);
|
||||
|
||||
|
||||
//
|
||||
// Batch utils
|
||||
|
||||
+3
-2
@@ -1,5 +1,6 @@
|
||||
#include "arg.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "download.h"
|
||||
@@ -303,7 +304,7 @@ static int common_download_file_single_online(const std::string & url,
|
||||
headers.emplace(h.first, h.second);
|
||||
}
|
||||
if (headers.find("User-Agent") == headers.end()) {
|
||||
headers.emplace("User-Agent", "llama-cpp/" + build_info);
|
||||
headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
|
||||
}
|
||||
if (!opts.bearer_token.empty()) {
|
||||
headers.emplace("Authorization", "Bearer " + opts.bearer_token);
|
||||
@@ -441,7 +442,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
|
||||
headers.emplace(h.first, h.second);
|
||||
}
|
||||
if (headers.find("User-Agent") == headers.end()) {
|
||||
headers.emplace("User-Agent", "llama-cpp/" + build_info);
|
||||
headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
|
||||
}
|
||||
|
||||
if (params.timeout > 0) {
|
||||
|
||||
+951
@@ -0,0 +1,951 @@
|
||||
#include "fit.h"
|
||||
|
||||
#include "log.h"
|
||||
|
||||
#include "../src/llama-ext.h"
|
||||
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <stdexcept>
|
||||
#include <cinttypes>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
|
||||
// enum to identify part of a layer for distributing its tensors:
|
||||
enum common_layer_fraction_t {
|
||||
LAYER_FRACTION_NONE = 0, // nothing
|
||||
LAYER_FRACTION_ATTN = 1, // attention
|
||||
LAYER_FRACTION_UP = 2, // attention + up
|
||||
LAYER_FRACTION_GATE = 3, // attention + up + gate
|
||||
LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights
|
||||
};
|
||||
|
||||
class common_params_fit_exception : public std::runtime_error {
|
||||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
static std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
ggml_log_level log_level) {
|
||||
struct user_data_t {
|
||||
struct {
|
||||
ggml_log_callback callback;
|
||||
void * user_data;
|
||||
} original_logger;
|
||||
ggml_log_level min_level; // prints below this log level go to debug log
|
||||
};
|
||||
user_data_t ud;
|
||||
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
||||
ud.min_level = log_level;
|
||||
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
||||
const user_data_t * ud = (const user_data_t *) user_data;
|
||||
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
||||
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
||||
}, &ud);
|
||||
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
mparams_copy.no_alloc = true;
|
||||
mparams_copy.use_mmap = false;
|
||||
mparams_copy.use_mlock = false;
|
||||
|
||||
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
||||
if (model == nullptr) {
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to load model");
|
||||
}
|
||||
|
||||
llama_context * ctx = llama_init_from_model(model, *cparams);
|
||||
if (ctx == nullptr) {
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to create llama_context from model");
|
||||
}
|
||||
|
||||
const size_t nd = llama_model_n_devices(model);
|
||||
std::vector<llama_device_memory_data> ret(nd + 1);
|
||||
|
||||
llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
|
||||
|
||||
for (const auto & [buft, mb] : memory_breakdown) {
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
ret.back().mb.model += mb.model;
|
||||
ret.back().mb.context += mb.context;
|
||||
ret.back().mb.compute += mb.compute;
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (!dev) {
|
||||
continue;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
if (dev == llama_model_get_device(model, i)) {
|
||||
ret[i].mb.model += mb.model;
|
||||
ret[i].mb.context += mb.context;
|
||||
ret[i].mb.compute += mb.compute;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error("no CPU backend found");
|
||||
}
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
||||
ret.back().free = free;
|
||||
ret.back().total = total;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(llama_model_get_device(model, i), &free, &total);
|
||||
|
||||
// devices can return 0 bytes for free and total memory if they do not
|
||||
// have any to report. in this case, we will use the host memory as a fallback
|
||||
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
||||
if (free == 0 && total == 0) {
|
||||
free = ret.back().free;
|
||||
total = ret.back().total;
|
||||
}
|
||||
ret[i].free = free;
|
||||
ret[i].total = total;
|
||||
}
|
||||
|
||||
devs.clear();
|
||||
for (int i = 0; i < llama_model_n_devices(model); i++) {
|
||||
devs.push_back(llama_model_get_device(model, i));
|
||||
}
|
||||
|
||||
hp_ngl = llama_model_n_layer(model);
|
||||
hp_n_ctx_train = llama_model_n_ctx_train(model);
|
||||
hp_n_expert = llama_model_n_expert(model);
|
||||
|
||||
common_memory_breakdown_print(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void common_params_fit_impl(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
|
||||
throw common_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
|
||||
}
|
||||
constexpr int64_t MiB = 1024*1024;
|
||||
typedef std::vector<llama_device_memory_data> dmds_t;
|
||||
const llama_model_params default_mparams = llama_model_default_params();
|
||||
|
||||
std::vector<ggml_backend_dev_t> devs;
|
||||
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||
uint32_t hp_nex = 0; // hparams.n_expert
|
||||
|
||||
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
||||
|
||||
LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const size_t nd = devs.size(); // number of devices
|
||||
|
||||
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
margins.reserve(nd);
|
||||
if (nd == 0) {
|
||||
margins.push_back(margins_s[0]);
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
margins.push_back(margins_s[id]);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> dev_names;
|
||||
{
|
||||
dev_names.reserve(nd);
|
||||
size_t max_length = 0;
|
||||
for (const auto & dev : devs) {
|
||||
std::string name = ggml_backend_dev_name(dev);
|
||||
name += " (";
|
||||
name += ggml_backend_dev_description(dev);
|
||||
name += ")";
|
||||
dev_names.push_back(name);
|
||||
max_length = std::max(max_length, name.length());
|
||||
}
|
||||
for (std::string & dn : dev_names) {
|
||||
dn.insert(dn.end(), max_length - dn.length(), ' ');
|
||||
}
|
||||
}
|
||||
|
||||
int64_t sum_free = 0;
|
||||
int64_t sum_projected_free = 0;
|
||||
int64_t sum_projected_used = 0;
|
||||
int64_t sum_projected_model = 0;
|
||||
std::vector<int64_t> projected_free_per_device;
|
||||
projected_free_per_device.reserve(nd);
|
||||
|
||||
if (nd == 0) {
|
||||
sum_projected_used = dmds_full.back().mb.total();
|
||||
sum_free = dmds_full.back().total;
|
||||
sum_projected_free = sum_free - sum_projected_used;
|
||||
LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (sum_projected_free >= margins[0]) {
|
||||
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
|
||||
__func__, sum_projected_free/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
if (nd > 1) {
|
||||
LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
}
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const llama_device_memory_data & dmd = dmds_full[id];
|
||||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
projected_free_per_device.push_back(projected_free);
|
||||
|
||||
sum_free += dmd.free;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
sum_projected_model += dmd.mb.model;
|
||||
|
||||
if (nd > 1) {
|
||||
LOG_INF("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
||||
}
|
||||
}
|
||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
||||
LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (nd == 1) {
|
||||
if (projected_free_per_device[0] >= margins[0]) {
|
||||
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
bool changes_needed = false;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (projected_free_per_device[id] < margins[id]) {
|
||||
changes_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!changes_needed) {
|
||||
LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// step 2: try reducing memory use by reducing the context size
|
||||
|
||||
{
|
||||
int64_t global_surplus = sum_projected_free;
|
||||
if (nd == 0) {
|
||||
global_surplus -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus -= margins[id];
|
||||
}
|
||||
}
|
||||
if (global_surplus < 0) {
|
||||
if (nd <= 1) {
|
||||
LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
||||
__func__, margins[0]/MiB, -global_surplus/MiB);
|
||||
} else {
|
||||
LOG_INF(
|
||||
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
|
||||
__func__, -global_surplus/MiB);
|
||||
}
|
||||
if (cparams->n_ctx == 0) {
|
||||
if (hp_nct > n_ctx_min) {
|
||||
int64_t sum_used_target = sum_free;
|
||||
if (nd == 0) {
|
||||
sum_used_target -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_used_target -= margins[id];
|
||||
}
|
||||
}
|
||||
if (nd > 1) {
|
||||
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
|
||||
// - for dense models only whole layers can be assigned to devices
|
||||
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
|
||||
// - on average we expect a waste of 0.5 layers/tensors per device
|
||||
// - use slightly more than the expected average for nd devices to be safe
|
||||
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
|
||||
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
|
||||
}
|
||||
|
||||
int64_t sum_projected_used_min_ctx = 0;
|
||||
cparams->n_ctx = n_ctx_min;
|
||||
const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
if (nd == 0) {
|
||||
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
|
||||
}
|
||||
}
|
||||
if (sum_used_target > sum_projected_used_min_ctx) {
|
||||
// linear interpolation between minimum and maximum context size:
|
||||
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
|
||||
/ (sum_projected_used - sum_projected_used_min_ctx);
|
||||
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
|
||||
|
||||
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
|
||||
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
|
||||
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
if (nd <= 1) {
|
||||
LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
|
||||
return;
|
||||
}
|
||||
LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
|
||||
} else {
|
||||
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
|
||||
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
}
|
||||
} else {
|
||||
if (n_ctx_min == UINT32_MAX) {
|
||||
LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
|
||||
} else {
|
||||
LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
|
||||
__func__, hp_nct, n_ctx_min);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nd == 0) {
|
||||
throw common_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
|
||||
}
|
||||
|
||||
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
|
||||
throw common_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
|
||||
}
|
||||
if (nd > 1) {
|
||||
if (!tensor_split) {
|
||||
throw common_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
|
||||
}
|
||||
if (mparams->tensor_split) {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (mparams->tensor_split[id] != 0.0f) {
|
||||
throw common_params_fit_exception("model_params::tensor_split already set by user, abort");
|
||||
}
|
||||
}
|
||||
}
|
||||
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
throw common_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
|
||||
}
|
||||
}
|
||||
if (!tensor_buft_overrides) {
|
||||
throw common_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
|
||||
}
|
||||
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
|
||||
throw common_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
|
||||
}
|
||||
|
||||
// step 3: iteratively fill the back to front with "dense" layers
|
||||
// - for a dense model simply fill full layers, giving each device a contiguous slice of the model
|
||||
// - for a MoE model, same as dense model but with all MoE tensors in system memory
|
||||
|
||||
// utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
|
||||
auto get_overflow_pattern = [&](const size_t il, const common_layer_fraction_t lf) -> const char * {
|
||||
constexpr size_t n_strings = 1000;
|
||||
if (il >= n_strings) {
|
||||
throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
|
||||
}
|
||||
switch (lf) {
|
||||
case LAYER_FRACTION_ATTN: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|up|gate_up|down).*";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
case LAYER_FRACTION_UP: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|gate_up|down).*";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
case LAYER_FRACTION_GATE: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
case LAYER_FRACTION_MOE: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate_up|gate)_(ch|)exps";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
};
|
||||
|
||||
struct ngl_t {
|
||||
uint32_t n_layer = 0; // number of total layers
|
||||
uint32_t n_part = 0; // number of partial layers, <= n_layer
|
||||
|
||||
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
|
||||
common_layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
|
||||
|
||||
uint32_t n_full() const {
|
||||
assert(n_layer >= n_part);
|
||||
return n_layer - n_part;
|
||||
}
|
||||
};
|
||||
|
||||
const size_t ntbo = llama_max_tensor_buft_overrides();
|
||||
|
||||
// utility function to set n_gpu_layers and tensor_split
|
||||
auto set_ngl_tensor_split_tbo = [&](
|
||||
const std::vector<ngl_t> & ngl_per_device,
|
||||
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
|
||||
llama_model_params & mparams) {
|
||||
mparams.n_gpu_layers = 0;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
|
||||
if (nd > 1) {
|
||||
tensor_split[id] = ngl_per_device[id].n_layer;
|
||||
}
|
||||
}
|
||||
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
|
||||
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
|
||||
|
||||
mparams.tensor_split = tensor_split;
|
||||
|
||||
size_t itbo = 0;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
il0 += ngl_per_device[id].n_full();
|
||||
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
|
||||
if (itbo + 1 >= ntbo) {
|
||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
||||
tensor_buft_overrides[itbo].buft = nullptr;
|
||||
itbo++;
|
||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
||||
throw common_params_fit_exception("llama_max_tensor_buft_overrides() == "
|
||||
+ std::to_string(ntbo) + " is insufficient for model");
|
||||
}
|
||||
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
|
||||
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
|
||||
itbo++;
|
||||
}
|
||||
il0 += ngl_per_device[id].n_part;
|
||||
}
|
||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
||||
tensor_buft_overrides[itbo].buft = nullptr;
|
||||
itbo++;
|
||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
||||
};
|
||||
|
||||
// utility function that returns the memory use per device for given numbers of layers per device
|
||||
auto get_memory_for_layers = [&](
|
||||
const char * func_name,
|
||||
const std::vector<ngl_t> & ngl_per_device,
|
||||
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
|
||||
|
||||
const dmds_t dmd_nl = common_get_device_memory_data(
|
||||
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
LOG_INF("%s: memory for test allocation by device:\n", func_name);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const ngl_t & n = ngl_per_device[id];
|
||||
LOG_INF(
|
||||
"%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
|
||||
func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
|
||||
}
|
||||
|
||||
std::vector<int64_t> ret;
|
||||
ret.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
ret.push_back(dmd_nl[id].mb.total());
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
|
||||
int64_t global_surplus_cpu_moe = 0;
|
||||
if (hp_nex > 0) {
|
||||
const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors
|
||||
ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
|
||||
tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
|
||||
tensor_buft_overrides[1] = {nullptr, nullptr};
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
|
||||
LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
|
||||
const dmds_t dmds_cpu_moe = common_get_device_memory_data(
|
||||
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
|
||||
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
|
||||
}
|
||||
|
||||
if (global_surplus_cpu_moe > 0) {
|
||||
LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
|
||||
__func__, global_surplus_cpu_moe/MiB);
|
||||
} else {
|
||||
LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
|
||||
__func__, -global_surplus_cpu_moe/MiB);
|
||||
}
|
||||
|
||||
// reset
|
||||
tensor_buft_overrides[0] = {nullptr, nullptr};
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
}
|
||||
|
||||
std::vector<int64_t> targets; // maximum acceptable memory use per device
|
||||
targets.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
targets.push_back(dmds_full[id].free - margins[id]);
|
||||
LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
|
||||
}
|
||||
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
|
||||
overflow_bufts.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
|
||||
}
|
||||
|
||||
std::vector<ngl_t> ngl_per_device(nd);
|
||||
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
|
||||
|
||||
// optimize the number of layers per device using the method of false position:
|
||||
// - ngl_per_device has 0 layers for each device, lower bound
|
||||
// - try a "high" configuration where a device is given all unassigned layers
|
||||
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
|
||||
// - check memory use of our guess, replace either the low or high bound
|
||||
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
|
||||
// - the last device has the output layer, which cannot be a partial layer
|
||||
if (hp_nex == 0) {
|
||||
LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
|
||||
} else {
|
||||
LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
|
||||
}
|
||||
for (int id = nd - 1; id >= 0; id--) {
|
||||
uint32_t n_unassigned = hp_ngl + 1;
|
||||
for (size_t jd = id + 1; jd < nd; ++jd) {
|
||||
assert(n_unassigned >= ngl_per_device[jd].n_layer);
|
||||
n_unassigned -= ngl_per_device[jd].n_layer;
|
||||
}
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
||||
ngl_per_device_high[id].n_layer = n_unassigned;
|
||||
if (hp_nex > 0) {
|
||||
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
|
||||
}
|
||||
if (ngl_per_device_high[id].n_layer > 0) {
|
||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
||||
if (mem_high[id] > targets[id]) {
|
||||
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
|
||||
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
step_size = std::max(step_size, uint32_t(1));
|
||||
step_size = std::min(step_size, delta - 1);
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
ngl_per_device_test[id].n_layer += step_size;
|
||||
if (hp_nex) {
|
||||
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
|
||||
step_size - 1 : step_size; // the first layer is the output layer which must always be full
|
||||
}
|
||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
||||
|
||||
if (mem_test[id] <= targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
} else {
|
||||
ngl_per_device_high = ngl_per_device_test;
|
||||
mem_high = mem_test;
|
||||
LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
|
||||
}
|
||||
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
}
|
||||
} else {
|
||||
assert(ngl_per_device_high[id].n_layer == n_unassigned);
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
mem = mem_high;
|
||||
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LOG_INF(
|
||||
"%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
||||
return;
|
||||
}
|
||||
|
||||
// step 4: for a MoE model where all dense tensors fit,
|
||||
// convert the dense-only layers in the back to full layers in the front until all devices are full
|
||||
// essentially the same procedure as for the dense-only layers except front-to-back
|
||||
// also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
|
||||
|
||||
size_t id_dense_start = nd;
|
||||
for (int id = nd - 1; id >= 0; id--) {
|
||||
if (ngl_per_device[id].n_layer > 0) {
|
||||
id_dense_start = id;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
assert(id_dense_start < nd);
|
||||
|
||||
LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
|
||||
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
|
||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
||||
for (size_t jd = id_dense_start; jd < nd; jd++) {
|
||||
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
|
||||
ngl_per_device_high[id].n_layer += n_layer_move;
|
||||
ngl_per_device_high[jd].n_layer -= n_layer_move;
|
||||
ngl_per_device_high[jd].n_part = 0;
|
||||
}
|
||||
size_t id_dense_start_high = nd - 1;
|
||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
||||
|
||||
if (mem_high[id] > targets[id]) {
|
||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
||||
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
step_size = std::max(step_size, uint32_t(1));
|
||||
step_size = std::min(step_size, delta - 1);
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
size_t id_dense_start_test = id_dense_start;
|
||||
uint32_t n_converted_test = 0;
|
||||
for (;id_dense_start_test < nd; id_dense_start_test++) {
|
||||
const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
|
||||
ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
|
||||
ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
|
||||
ngl_per_device_test[id].n_layer += n_convert_jd;
|
||||
n_converted_test += n_convert_jd;
|
||||
|
||||
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
||||
|
||||
if (mem_test[id] <= targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
} else {
|
||||
ngl_per_device_high = ngl_per_device_test;
|
||||
mem_high = mem_test;
|
||||
id_dense_start_high = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
|
||||
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
|
||||
}
|
||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
||||
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
||||
}
|
||||
} else {
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
mem = mem_high;
|
||||
id_dense_start = id_dense_start_high;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
|
||||
// try to fit at least part of one more layer
|
||||
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
size_t id_dense_start_test = id_dense_start;
|
||||
ngl_per_device_test[id_dense_start_test].n_layer--;
|
||||
ngl_per_device_test[id_dense_start_test].n_part--;
|
||||
ngl_per_device_test[id].n_layer++;
|
||||
ngl_per_device_test[id].n_part++;
|
||||
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
|
||||
id_dense_start_test++;
|
||||
}
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
|
||||
if (id < nd - 1) {
|
||||
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
|
||||
}
|
||||
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
|
||||
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
|
||||
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
} else {
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
|
||||
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LOG_INF(
|
||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
|
||||
// print info for devices that were not changed during the conversion from dense only to full layers:
|
||||
for (size_t id = id_dense_start + 1; id < nd; id++) {
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LOG_INF(
|
||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
||||
}
|
||||
|
||||
enum common_params_fit_status common_fit_params(
|
||||
const char * path_model,
|
||||
llama_model_params * mparams,
|
||||
llama_context_params * cparams,
|
||||
float * tensor_split,
|
||||
llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t * margins,
|
||||
uint32_t n_ctx_min,
|
||||
ggml_log_level log_level) {
|
||||
const int64_t t0_us = llama_time_us();
|
||||
common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
|
||||
try {
|
||||
common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
|
||||
LOG_INF("%s: successfully fit params to free device memory\n", __func__);
|
||||
} catch (const common_params_fit_exception & e) {
|
||||
LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
||||
status = COMMON_PARAMS_FIT_STATUS_FAILURE;
|
||||
} catch (const std::runtime_error & e) {
|
||||
LOG_ERR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
|
||||
status = COMMON_PARAMS_FIT_STATUS_ERROR;
|
||||
}
|
||||
const int64_t t1_us = llama_time_us();
|
||||
LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
|
||||
return status;
|
||||
}
|
||||
|
||||
void common_memory_breakdown_print(const struct llama_context * ctx) {
|
||||
//const auto & devices = ctx->get_model().devices;
|
||||
const auto * model = llama_get_model(ctx);
|
||||
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
for (int i = 0; i < llama_model_n_devices(model); i++) {
|
||||
devices.push_back(llama_model_get_device(model, i));
|
||||
}
|
||||
|
||||
llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
|
||||
|
||||
std::vector<std::array<std::string, 9>> table_data;
|
||||
table_data.reserve(devices.size());
|
||||
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
|
||||
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
|
||||
const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
|
||||
|
||||
table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
|
||||
|
||||
constexpr size_t MiB = 1024 * 1024;
|
||||
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
|
||||
|
||||
// track seen buffer types to avoid double counting:
|
||||
std::set<ggml_backend_buffer_type_t> seen_buffer_types;
|
||||
|
||||
// accumulative memory breakdown for each device and for host:
|
||||
std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
|
||||
llama_memory_breakdown_data mb_host;
|
||||
|
||||
for (const auto & buft_mb : memory_breakdown) {
|
||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
mb_host.model += mb.model;
|
||||
mb_host.context += mb.context;
|
||||
mb_host.compute += mb.compute;
|
||||
seen_buffer_types.insert(buft);
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (dev) {
|
||||
int i_dev = -1;
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
if (devices[i] == dev) {
|
||||
i_dev = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i_dev != -1) {
|
||||
mb_dev[i_dev].model += mb.model;
|
||||
mb_dev[i_dev].context += mb.context;
|
||||
mb_dev[i_dev].compute += mb.compute;
|
||||
seen_buffer_types.insert(buft);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// print memory breakdown for each device:
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
ggml_backend_dev_t dev = devices[i];
|
||||
llama_memory_breakdown_data mb = mb_dev[i];
|
||||
|
||||
const std::string name = ggml_backend_dev_name(dev);
|
||||
std::string desc = ggml_backend_dev_description(dev);
|
||||
for (const std::string & prefix : desc_prefixes_strip) {
|
||||
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
|
||||
desc = desc.substr(prefix.length());
|
||||
}
|
||||
}
|
||||
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
const size_t unaccounted = total - self - free;
|
||||
|
||||
table_data.push_back({
|
||||
template_gpu,
|
||||
" - " + name + " (" + desc + ")",
|
||||
std::to_string(total / MiB),
|
||||
std::to_string(free / MiB),
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
std::to_string(unaccounted / MiB)});
|
||||
}
|
||||
|
||||
// print memory breakdown for host:
|
||||
{
|
||||
const size_t self = mb_host.model + mb_host.context + mb_host.compute;
|
||||
table_data.push_back({
|
||||
template_other,
|
||||
" - Host",
|
||||
"", // total
|
||||
"", // free
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb_host.model / MiB),
|
||||
std::to_string(mb_host.context / MiB),
|
||||
std::to_string(mb_host.compute / MiB),
|
||||
""}); // unaccounted
|
||||
}
|
||||
|
||||
// print memory breakdown for all remaining buffer types:
|
||||
for (const auto & buft_mb : memory_breakdown) {
|
||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
||||
if (seen_buffer_types.count(buft) == 1) {
|
||||
continue;
|
||||
}
|
||||
const std::string name = ggml_backend_buft_name(buft);
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
table_data.push_back({
|
||||
template_other,
|
||||
" - " + name,
|
||||
"", // total
|
||||
"", // free
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
""}); // unaccounted
|
||||
seen_buffer_types.insert(buft);
|
||||
}
|
||||
|
||||
for (size_t j = 1; j < table_data[0].size(); j++) {
|
||||
size_t max_len = 0;
|
||||
for (const auto & td : table_data) {
|
||||
max_len = std::max(max_len, td[j].length());
|
||||
}
|
||||
for (auto & td : table_data) {
|
||||
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
|
||||
}
|
||||
}
|
||||
for (const auto & td : table_data) {
|
||||
LOG_INF(td[0].c_str(),
|
||||
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
|
||||
td[6].c_str(), td[7].c_str(), td[8].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void common_fit_print(
|
||||
const char * path_model,
|
||||
llama_model_params * mparams,
|
||||
llama_context_params * cparams) {
|
||||
std::vector<ggml_backend_dev_t> devs;
|
||||
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||
uint32_t hp_nex = 0; // hparams.n_expert
|
||||
|
||||
auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
|
||||
GGML_ASSERT(dmd.size() == devs.size() + 1);
|
||||
|
||||
for (size_t id = 0; id < devs.size(); id++) {
|
||||
printf("%s ", ggml_backend_dev_name(devs[id]));
|
||||
printf("%zu ", dmd[id].mb.model/1024/1024);
|
||||
printf("%zu ", dmd[id].mb.context/1024/1024);
|
||||
printf("%zu ", dmd[id].mb.compute/1024/1024);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("Host ");
|
||||
printf("%zu ", dmd.back().mb.model/1024/1024);
|
||||
printf("%zu ", dmd.back().mb.context/1024/1024);
|
||||
printf("%zu ", dmd.back().mb.compute/1024/1024);
|
||||
printf("\n");
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
enum common_params_fit_status {
|
||||
COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
|
||||
COMMON_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
|
||||
COMMON_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
|
||||
};
|
||||
|
||||
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
|
||||
// - returns true if the parameters could be successfully modified to fit device memory
|
||||
// - this function is NOT thread safe because it modifies the global llama logger state
|
||||
// - only parameters that have the same value as in llama_default_model_params are modified
|
||||
// with the exception of the context size which is modified if and only if equal to 0
|
||||
enum common_params_fit_status common_fit_params(
|
||||
const char * path_model,
|
||||
struct llama_model_params * mparams,
|
||||
struct llama_context_params * cparams,
|
||||
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
|
||||
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
|
||||
size_t * margins, // margins of memory to leave per device in bytes
|
||||
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
|
||||
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
|
||||
|
||||
// print estimated memory to stdout
|
||||
void common_fit_print(
|
||||
const char * path_model,
|
||||
struct llama_model_params * mparams,
|
||||
struct llama_context_params * cparams);
|
||||
|
||||
void common_memory_breakdown_print(const struct llama_context * ctx);
|
||||
+4
-3
@@ -1,5 +1,6 @@
|
||||
#include "hf-cache.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "http.h"
|
||||
@@ -200,7 +201,7 @@ static nl::json api_get(const std::string & url,
|
||||
auto [cli, parts] = common_http_client(url);
|
||||
|
||||
httplib::Headers headers = {
|
||||
{"User-Agent", "llama-cpp/" + build_info},
|
||||
{"User-Agent", "llama-cpp/" + std::string(llama_build_info())},
|
||||
{"Accept", "application/json"}
|
||||
};
|
||||
|
||||
@@ -229,7 +230,7 @@ static nl::json api_get(const std::string & url,
|
||||
static std::string get_repo_commit(const std::string & repo_id,
|
||||
const std::string & token) {
|
||||
try {
|
||||
auto endpoint = get_model_endpoint();
|
||||
auto endpoint = common_get_model_endpoint();
|
||||
auto json = api_get(endpoint + "api/models/" + repo_id + "/refs", token);
|
||||
|
||||
if (!json.is_object() ||
|
||||
@@ -307,7 +308,7 @@ hf_files get_repo_files(const std::string & repo_id,
|
||||
hf_files files;
|
||||
|
||||
try {
|
||||
auto endpoint = get_model_endpoint();
|
||||
auto endpoint = common_get_model_endpoint();
|
||||
auto json = api_get(endpoint + "api/models/" + repo_id + "/tree/" + commit + "?recursive=true", token);
|
||||
|
||||
if (!json.is_array()) {
|
||||
|
||||
@@ -23,6 +23,10 @@
|
||||
|
||||
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
|
||||
|
||||
int common_log_get_verbosity_thold(void) {
|
||||
return common_log_verbosity_thold;
|
||||
}
|
||||
|
||||
void common_log_set_verbosity_thold(int verbosity) {
|
||||
common_log_verbosity_thold = verbosity;
|
||||
}
|
||||
|
||||
+2
-2
@@ -38,7 +38,7 @@ enum log_colors {
|
||||
|
||||
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
|
||||
// set via common_log_set_verbosity()
|
||||
extern int common_log_verbosity_thold;
|
||||
int common_log_get_verbosity_thold(void);
|
||||
|
||||
void common_log_set_verbosity_thold(int verbosity); // not thread-safe
|
||||
|
||||
@@ -98,7 +98,7 @@ void common_log_flush (struct common_log * log); // f
|
||||
|
||||
#define LOG_TMPL(level, verbosity, ...) \
|
||||
do { \
|
||||
if ((verbosity) <= common_log_verbosity_thold) { \
|
||||
if ((verbosity) <= common_log_get_verbosity_thold()) { \
|
||||
common_log_add(common_log_main(), (level), __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -208,7 +208,7 @@ void common_ngram_map_begin(
|
||||
count_keys, count_keys_del, count_values_del, count_map_entries_upd);
|
||||
}
|
||||
|
||||
map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
|
||||
map.idx_last_check = size_begin;
|
||||
map.size_last_begin = size_begin;
|
||||
}
|
||||
|
||||
@@ -231,7 +231,7 @@ void common_ngram_map_draft(common_ngram_map & map,
|
||||
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
|
||||
}
|
||||
|
||||
if (map.idx_last_check > cur_len) {
|
||||
if (map.idx_last_check > cur_len) {
|
||||
// Should not happen because of common_ngram_map_begin().
|
||||
GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len);
|
||||
}
|
||||
@@ -386,7 +386,7 @@ void common_ngram_map_draft(common_ngram_map & map,
|
||||
LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
|
||||
curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
|
||||
|
||||
map.last_draft_created = false;
|
||||
map.last_draft_created = true;
|
||||
map.last_draft_key_idx = key_offset;
|
||||
map.last_draft_value_idx = 0; // value 0 is used for simple mode
|
||||
return;
|
||||
@@ -524,7 +524,7 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
|
||||
struct common_ngram_map_value & curr_value = curr_key.values[val_idx]; // value used for draft generation.
|
||||
|
||||
// update the value statistics
|
||||
LOG_INF("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
|
||||
LOG_DBG("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
|
||||
n_accepted, curr_value.n_accepted);
|
||||
curr_value.n_accepted = n_accepted;
|
||||
}
|
||||
|
||||
+4
-2
@@ -1,10 +1,12 @@
|
||||
#include "sampling.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "fit.h"
|
||||
#include "log.h"
|
||||
#include "reasoning-budget.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <climits>
|
||||
@@ -511,7 +513,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
|
||||
LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
|
||||
LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
|
||||
|
||||
llama_memory_breakdown_print(ctx);
|
||||
common_memory_breakdown_print(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+135
-52
@@ -13,6 +13,7 @@
|
||||
#include <cstring>
|
||||
#include <iomanip>
|
||||
#include <map>
|
||||
#include <cinttypes>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
@@ -144,10 +145,28 @@ struct common_speculative_state {
|
||||
virtual void accept(uint16_t n_accepted) = 0;
|
||||
};
|
||||
|
||||
struct common_speculative_checkpoint {
|
||||
llama_pos pos_min = 0;
|
||||
llama_pos pos_max = 0;
|
||||
|
||||
int64_t n_tokens = 0;
|
||||
|
||||
std::vector<uint8_t> data;
|
||||
|
||||
size_t size() const {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
size_t ckpt_size = 0;
|
||||
};
|
||||
|
||||
struct common_speculative_state_draft : public common_speculative_state {
|
||||
llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
|
||||
llama_context * ctx_dft;
|
||||
|
||||
bool use_ckpt = false;
|
||||
struct common_speculative_checkpoint ckpt;
|
||||
|
||||
common_sampler * smpl;
|
||||
|
||||
llama_batch batch;
|
||||
@@ -160,10 +179,12 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
enum common_speculative_type type,
|
||||
llama_context * ctx_tgt,
|
||||
llama_context * ctx_dft,
|
||||
const std::vector<std::pair<std::string, std::string>> & replacements)
|
||||
const std::vector<std::pair<std::string, std::string>> & replacements,
|
||||
bool use_ckpt)
|
||||
: common_speculative_state(type)
|
||||
, ctx_tgt(ctx_tgt)
|
||||
, ctx_dft(ctx_dft)
|
||||
, use_ckpt(use_ckpt)
|
||||
{
|
||||
batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
|
||||
smpl = nullptr;
|
||||
@@ -218,7 +239,48 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
}
|
||||
|
||||
void begin(const llama_tokens & prompt) override {
|
||||
GGML_UNUSED(prompt);
|
||||
if (use_ckpt && ckpt.size() > 0) {
|
||||
// delete checkpoint
|
||||
LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
|
||||
__func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
|
||||
ckpt.pos_min = 0;
|
||||
ckpt.pos_max = 0;
|
||||
ckpt.n_tokens = 0;
|
||||
ckpt.ckpt_size = 0;
|
||||
ckpt.data.clear();
|
||||
}
|
||||
}
|
||||
|
||||
size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) {
|
||||
int slot_id = 0;
|
||||
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
|
||||
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
|
||||
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
|
||||
ckpt.n_tokens = n_tokens_prompt - n_tokens_batch;
|
||||
ckpt.data.resize(checkpoint_size);
|
||||
|
||||
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
if (n != checkpoint_size) {
|
||||
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
|
||||
}
|
||||
|
||||
LOG_DBG("%s: pos_min = %d, pos_max = %d, size = %.3f MiB\n", __func__,
|
||||
ckpt.pos_min, ckpt.pos_max, (float) ckpt.data.size() / 1024 / 1024);
|
||||
return n;
|
||||
}
|
||||
|
||||
size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) {
|
||||
int slot_id = 0;
|
||||
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
if (n != ckpt_size_part_expected) {
|
||||
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
|
||||
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n);
|
||||
}
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1);
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
void draft(
|
||||
@@ -236,8 +298,8 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
|
||||
auto * mem_dft = llama_get_memory(ctx_dft);
|
||||
|
||||
int reuse_i = 0;
|
||||
int reuse_n = 0;
|
||||
int reuse_i = 0; // index of part to be reused in prompt_dft
|
||||
int reuse_n = 0; // length of part to be reused in prompt_dft
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max;
|
||||
|
||||
@@ -287,18 +349,26 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
|
||||
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
|
||||
__func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
|
||||
if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) {
|
||||
LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
|
||||
__func__, reuse_i, reuse_n);
|
||||
reuse_i = 0;
|
||||
reuse_n = 0;
|
||||
}
|
||||
|
||||
result.clear();
|
||||
result.reserve(params.n_max);
|
||||
|
||||
if (reuse_n == 0) {
|
||||
bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
|
||||
if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
|
||||
llama_memory_clear(mem_dft, false);
|
||||
prompt_dft.clear();
|
||||
} else {
|
||||
// this happens when a previous draft has been discarded (for example, due to being too small), but the
|
||||
// target model agreed with it. in this case, we simply pass back the previous results to save compute
|
||||
if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
|
||||
if (reuse_i + reuse_n < (int64_t) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
|
||||
for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
|
||||
result.push_back(prompt_dft[i]);
|
||||
|
||||
@@ -310,19 +380,50 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
return;
|
||||
}
|
||||
|
||||
bool do_restore = false;
|
||||
if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) {
|
||||
// This can happen after a partial acceptance (speculative decoding with checkpoints)
|
||||
LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n",
|
||||
__func__, prompt_dft.size(), prompt_cur.size());
|
||||
prompt_dft.resize(prompt_cur.size());
|
||||
do_restore = true;
|
||||
}
|
||||
|
||||
if (reuse_i > 0) {
|
||||
llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
|
||||
bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
|
||||
if (!is_removed) {
|
||||
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i);
|
||||
}
|
||||
llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
|
||||
|
||||
prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
|
||||
}
|
||||
|
||||
if (reuse_n < (int) prompt_dft.size()) {
|
||||
llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
|
||||
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
|
||||
if (reuse_n < (int) prompt_dft.size() || do_restore) {
|
||||
if (use_ckpt) {
|
||||
if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
|
||||
LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
|
||||
__func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
|
||||
}
|
||||
draft_restore_checkpoint(ckpt.ckpt_size);
|
||||
reuse_n = ckpt.n_tokens;
|
||||
prompt_dft.resize(reuse_n);
|
||||
needs_ckpt = false;
|
||||
} else {
|
||||
bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
|
||||
if (!is_removed) {
|
||||
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n",
|
||||
__func__, reuse_n, prompt_dft.size());
|
||||
}
|
||||
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (needs_ckpt) {
|
||||
ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens);
|
||||
}
|
||||
|
||||
// prepare a batch to evaluate any new tokens in the prompt
|
||||
common_batch_clear(batch);
|
||||
|
||||
@@ -337,7 +438,11 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
if (batch.n_tokens > 0) {
|
||||
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
|
||||
|
||||
llama_decode(ctx_dft, batch);
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0 && ret != 1) {
|
||||
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n",
|
||||
__func__, ret, prompt_cur.size());
|
||||
}
|
||||
}
|
||||
|
||||
const llama_pos n_past = prompt_dft.size();
|
||||
@@ -351,7 +456,11 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
|
||||
LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
|
||||
|
||||
llama_decode(ctx_dft, batch);
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0 && ret != 1) {
|
||||
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
|
||||
__func__, ret, prompt_cur.size(), prompt_dft.size());
|
||||
}
|
||||
|
||||
common_sampler_reset(smpl);
|
||||
|
||||
@@ -387,7 +496,11 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
|
||||
|
||||
// evaluate the drafted tokens on the draft model
|
||||
llama_decode(ctx_dft, batch);
|
||||
ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0) {
|
||||
LOG_WRN("%s: llama_decode[%d] returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
|
||||
__func__, i, ret, prompt_cur.size(), prompt_dft.size());
|
||||
}
|
||||
|
||||
prompt_dft.push_back(id);
|
||||
}
|
||||
@@ -636,6 +749,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
|
||||
|
||||
mod.reset();
|
||||
n_low = 0;
|
||||
i_last = 0;
|
||||
}
|
||||
} else {
|
||||
n_low = 0;
|
||||
@@ -739,6 +853,7 @@ struct common_speculative_state_ngram_cache : public common_speculative_state {
|
||||
|
||||
struct common_speculative {
|
||||
std::vector<std::unique_ptr<common_speculative_state>> impls; // list of implementations to use and their states
|
||||
|
||||
common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats)
|
||||
};
|
||||
|
||||
@@ -798,42 +913,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
||||
return it->second;
|
||||
}
|
||||
|
||||
bool common_speculative_is_compat(llama_context * ctx_tgt) {
|
||||
auto * mem = llama_get_memory(ctx_tgt);
|
||||
if (mem == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool res = true;
|
||||
|
||||
llama_memory_clear(mem, true);
|
||||
|
||||
// eval 2 tokens to check if the context is compatible
|
||||
std::vector<llama_token> tmp;
|
||||
tmp.push_back(0);
|
||||
tmp.push_back(0);
|
||||
|
||||
int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||
if (ret != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
|
||||
res = false;
|
||||
goto done;
|
||||
}
|
||||
|
||||
// try to remove the last tokens
|
||||
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
|
||||
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
|
||||
res = false;
|
||||
goto done;
|
||||
}
|
||||
|
||||
done:
|
||||
llama_memory_clear(mem, true);
|
||||
llama_synchronize(ctx_tgt);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// initialization of the speculative decoding system
|
||||
//
|
||||
common_speculative * common_speculative_init(
|
||||
@@ -908,10 +987,13 @@ common_speculative * common_speculative_init(
|
||||
case COMMON_SPECULATIVE_TYPE_NONE:
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT: {
|
||||
const bool use_ckpt = common_context_can_seq_rm(ctx_dft) == COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
|
||||
|
||||
impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
|
||||
/* .ctx_tgt = */ ctx_tgt,
|
||||
/* .ctx_dft = */ ctx_dft,
|
||||
/* .replacements = */ params.replacements
|
||||
/* .replacements = */ params.replacements,
|
||||
/* .use_ckpt = */ use_ckpt
|
||||
));
|
||||
break;
|
||||
}
|
||||
@@ -966,7 +1048,8 @@ common_speculative * common_speculative_init(
|
||||
}
|
||||
|
||||
auto * result = new common_speculative {
|
||||
/* .impls = */ std::move(impls)
|
||||
/* .impls = */ std::move(impls),
|
||||
/* .curr_impl = */ nullptr,
|
||||
};
|
||||
|
||||
return result;
|
||||
|
||||
@@ -14,10 +14,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
||||
// convert type to string
|
||||
std::string common_speculative_type_to_str(enum common_speculative_type type);
|
||||
|
||||
// check if the llama_context is compatible for speculative decoding
|
||||
// note: clears the memory of the context
|
||||
bool common_speculative_is_compat(llama_context * ctx_tgt);
|
||||
|
||||
common_speculative * common_speculative_init(
|
||||
common_params_speculative & params,
|
||||
llama_context * ctx_tgt);
|
||||
@@ -39,3 +35,9 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
|
||||
|
||||
// print statistics about the speculative decoding
|
||||
void common_speculative_print_stats(const common_speculative * spec);
|
||||
|
||||
struct common_speculative_deleter {
|
||||
void operator()(common_speculative * s) { common_speculative_free(s); }
|
||||
};
|
||||
|
||||
typedef std::unique_ptr<common_speculative, common_speculative_deleter> common_speculative_ptr;
|
||||
|
||||
+120
-22
@@ -746,7 +746,12 @@ class ModelBase:
|
||||
|
||||
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
|
||||
with open(quant_config_file, "r", encoding="utf-8") as f:
|
||||
quant_config = json.load(f).get("quantization") or {}
|
||||
hf_quant_config = json.load(f)
|
||||
quant_config = hf_quant_config.get("quantization") or {}
|
||||
producer = hf_quant_config.get("producer") or {}
|
||||
producer_name = (producer.get("name") or "").lower()
|
||||
if quant_method is None:
|
||||
self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
|
||||
quant_algo = quant_config.get("quant_algo", quant_algo)
|
||||
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
|
||||
|
||||
@@ -1850,20 +1855,28 @@ class TextModel(ModelBase):
|
||||
with open(module_path, encoding="utf-8") as f:
|
||||
modules = json.load(f)
|
||||
for mod in modules:
|
||||
if mod["type"] == "sentence_transformers.models.Pooling":
|
||||
if mod["type"].endswith("Pooling"):
|
||||
pooling_path = mod["path"]
|
||||
break
|
||||
|
||||
mode_mapping = {
|
||||
"mean": gguf.PoolingType.MEAN,
|
||||
"cls": gguf.PoolingType.CLS,
|
||||
"lasttoken": gguf.PoolingType.LAST,
|
||||
}
|
||||
|
||||
# get pooling type
|
||||
if pooling_path is not None:
|
||||
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
|
||||
pooling = json.load(f)
|
||||
if pooling["pooling_mode_mean_tokens"]:
|
||||
if pooling.get("pooling_mode_mean_tokens"):
|
||||
pooling_type = gguf.PoolingType.MEAN
|
||||
elif pooling["pooling_mode_cls_token"]:
|
||||
elif pooling.get("pooling_mode_cls_token"):
|
||||
pooling_type = gguf.PoolingType.CLS
|
||||
elif pooling["pooling_mode_lasttoken"]:
|
||||
elif pooling.get("pooling_mode_lasttoken"):
|
||||
pooling_type = gguf.PoolingType.LAST
|
||||
elif (pooling_mode := pooling.get("pooling_mode")) in mode_mapping:
|
||||
pooling_type = mode_mapping[pooling_mode]
|
||||
else:
|
||||
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
|
||||
self.gguf_writer.add_pooling_type(pooling_type)
|
||||
@@ -7180,7 +7193,7 @@ class EmbeddingGemma(Gemma3Model):
|
||||
with open(modules_file, encoding="utf-8") as modules_json_file:
|
||||
mods = json.load(modules_json_file)
|
||||
for mod in mods:
|
||||
if mod["type"] == "sentence_transformers.models.Dense":
|
||||
if mod["type"].endswith("Dense"):
|
||||
mod_path = mod["path"]
|
||||
# check if model.safetensors file for Dense layer exists
|
||||
model_tensors_file = self.dir_model / mod_path / "model.safetensors"
|
||||
@@ -10912,14 +10925,14 @@ class NemotronHModel(GraniteHybridModel):
|
||||
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
|
||||
self.hparams["vocab_size"] = vocab_size
|
||||
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
|
||||
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
|
||||
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
|
||||
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
@@ -10930,7 +10943,7 @@ class NemotronHModel(GraniteHybridModel):
|
||||
if token in added_vocab:
|
||||
if not added_tokens_decoder[i].normalized:
|
||||
previous_token = token
|
||||
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
|
||||
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
|
||||
if previous_token != token:
|
||||
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
|
||||
|
||||
@@ -11847,7 +11860,7 @@ class LLaDAMoEModel(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
|
||||
@ModelBase.register("HunYuanDenseV1ForCausalLM")
|
||||
class HunYuanModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
|
||||
|
||||
@@ -11986,28 +11999,58 @@ class HunYuanModel(TextModel):
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanVLForConditionalGeneration")
|
||||
class HunyuanOCRVisionModel(MmprojModel):
|
||||
class HunyuanVLVisionModel(MmprojModel):
|
||||
# Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
|
||||
# "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
|
||||
# Each variant maps to a different projector type in clip.cpp so image
|
||||
# preprocessing follows the correct code path.
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
# HunyuanOCR uses max_image_size instead of image_size
|
||||
# HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
|
||||
if "image_size" not in self.hparams_vision:
|
||||
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
|
||||
|
||||
@staticmethod
|
||||
def is_ocr_variant(hparams: dict) -> bool:
|
||||
"""Return True for HunyuanOCR, False for HunyuanVL.
|
||||
|
||||
The projector's output dim must equal the text model's hidden_size by
|
||||
construction (that's what "projector" means). HunyuanOCR pairs a 1B text
|
||||
backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
|
||||
ViT -> LLM projection dim is a hard architectural signature, not a
|
||||
magic number.
|
||||
"""
|
||||
vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
|
||||
return vision_out == 1024
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
assert self.hparams_vision is not None
|
||||
hparams = self.hparams_vision
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
|
||||
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
|
||||
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
|
||||
vcfg = self.hparams_vision
|
||||
|
||||
if self.is_ocr_variant(self.global_config):
|
||||
# --- HunyuanOCR ---
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
|
||||
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
|
||||
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
|
||||
return
|
||||
|
||||
# --- HunyuanVL ---
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
|
||||
self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
|
||||
self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
|
||||
self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if not name.startswith("vit."):
|
||||
return # skip text tensors
|
||||
return
|
||||
# strip CLS token (row 0) from position embeddings so resize_position_embeddings works
|
||||
if "position_embedding" in name:
|
||||
data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
|
||||
@@ -12015,11 +12058,66 @@ class HunyuanOCRVisionModel(MmprojModel):
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
|
||||
# Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
|
||||
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
|
||||
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanVLForConditionalGeneration")
|
||||
class HunyuanVLTextModel(HunYuanModel):
|
||||
# The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
|
||||
# and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
|
||||
# while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
|
||||
# the config and pick the matching GGUF architecture.
|
||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
|
||||
|
||||
@staticmethod
|
||||
def _is_ocr_config(hparams: dict) -> bool:
|
||||
# OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
|
||||
# outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
|
||||
# HunyuanVLVisionModel.is_ocr_variant.
|
||||
return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
|
||||
|
||||
def __init__(self, dir_model: Path, *args, **kwargs):
|
||||
raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
|
||||
if self._is_ocr_config(raw_hparams):
|
||||
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
|
||||
else:
|
||||
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
|
||||
super().__init__(dir_model, *args, **kwargs)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
|
||||
# the HunYuan-Dense arch which already handles standard rope in super().
|
||||
if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
|
||||
return
|
||||
|
||||
if self.rope_parameters.get("rope_type") != "xdrope":
|
||||
return
|
||||
|
||||
# defaults for HunyuanVL. The C++ side later computes:
|
||||
# freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
|
||||
self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
|
||||
self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1)))
|
||||
|
||||
ctx_len = int(self.hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len)
|
||||
self.gguf_writer.add_context_length(ctx_len)
|
||||
|
||||
self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"]))
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# Skip vision tensors — they are written by HunyuanVLVisionModel
|
||||
if name.startswith("vit."):
|
||||
return
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("SmolLM3ForCausalLM")
|
||||
class SmolLM3Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.SMOLLM3
|
||||
|
||||
@@ -244,7 +244,6 @@ build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
|
||||
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
|
||||
- `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
|
||||
- `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
|
||||
- For Intel GPU, NPU detection in containers, GPU, NPU user-space drivers/libraries must be present inside the image. We will include in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)
|
||||
|
||||
> [!NOTE]
|
||||
> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
|
||||
@@ -274,8 +273,6 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
|
||||
Run llama.cpp with OpenVINO backend Docker container.
|
||||
Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.
|
||||
|
||||
> [!NOTE]
|
||||
> Intel GPU, NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).
|
||||
|
||||
```bash
|
||||
# Run Docker container
|
||||
|
||||
@@ -31,6 +31,8 @@ SYCL cross-platform capabilities enable support for other vendor GPUs as well.
|
||||
|
||||
## Recommended Release
|
||||
|
||||
### Windows
|
||||
|
||||
The following releases are verified and recommended:
|
||||
|
||||
|Commit ID|Tag|Release|Verified Platform| Update date|
|
||||
@@ -39,6 +41,13 @@ The following releases are verified and recommended:
|
||||
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|
||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
|
||||
|
||||
### Ubuntu 24.04
|
||||
|
||||
The release packages for Ubuntu 24.04 x64 (FP32/FP16) only include the binary files of the llama.cpp SYCL backend. They require the target machine to have pre-installed Intel GPU drivers and oneAPI packages that are the same version as the build package. To get the version and installation info, refer to release.yml: ubuntu-24-sycl -> Download & Install oneAPI.
|
||||
|
||||
It is recommended to use them with Intel Docker.
|
||||
|
||||
The packages for FP32 and FP16 would have different accuracy and performance on LLMs. Please choose it acording to the test result.
|
||||
|
||||
## News
|
||||
|
||||
@@ -229,6 +238,7 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|
||||
|
||||
|Verified release|
|
||||
|-|
|
||||
|2025.3.3 |
|
||||
|2025.2.1|
|
||||
|2025.1|
|
||||
|2024.1|
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-batched)
|
||||
add_executable(${TARGET} batched.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-convert-llama2c-to-ggml)
|
||||
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-debug)
|
||||
add_executable(${TARGET} debug.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-diffusion-cli)
|
||||
add_executable(${TARGET} diffusion-cli.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-embedding)
|
||||
add_executable(${TARGET} embedding.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
set(TARGET llama-eval-callback)
|
||||
add_executable(${TARGET} eval-callback.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_BUILD_TESTS)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-gen-docs)
|
||||
add_executable(${TARGET} gen-docs.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-idle)
|
||||
add_executable(${TARGET} idle.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -51,6 +51,6 @@ target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE
|
||||
|
||||
target_link_libraries(${CMAKE_PROJECT_NAME}
|
||||
llama
|
||||
common
|
||||
llama-common
|
||||
android
|
||||
log)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-lookahead)
|
||||
add_executable(${TARGET} lookahead.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,23 +1,23 @@
|
||||
set(TARGET llama-lookup)
|
||||
add_executable(${TARGET} lookup.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-lookup-create)
|
||||
add_executable(${TARGET} lookup-create.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-lookup-merge)
|
||||
add_executable(${TARGET} lookup-merge.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-lookup-stats)
|
||||
add_executable(${TARGET} lookup-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -25,7 +25,11 @@ MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
|
||||
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
|
||||
TYPE="${OUTTYPE:-f16}"
|
||||
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
|
||||
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
|
||||
if [[ -n "$MMPROJ" ]]; then
|
||||
CONVERTED_MODEL="${OUTPUT_DIR}/mmproj-${MODEL_NAME}.gguf"
|
||||
else
|
||||
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
|
||||
fi
|
||||
|
||||
echo "Model path: ${MODEL_PATH}"
|
||||
echo "Model name: ${MODEL_NAME}"
|
||||
@@ -38,6 +42,7 @@ if [[ -n "$DEBUG" ]]; then
|
||||
else
|
||||
CMD_ARGS=("python")
|
||||
fi
|
||||
|
||||
CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose")
|
||||
CMD_ARGS+=("${MODEL_PATH}")
|
||||
CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
|
||||
@@ -50,7 +55,3 @@ CMD_ARGS+=("--outtype" "${TYPE}")
|
||||
echo ""
|
||||
echo "The environment variable CONVERTED_MODEL can be set to this path using:"
|
||||
echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"
|
||||
if [[ -n "$MMPROJ" ]]; then
|
||||
mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
|
||||
echo "The mmproj model was created in $(realpath "$mmproj_file")"
|
||||
fi
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-parallel)
|
||||
add_executable(${TARGET} parallel.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-passkey)
|
||||
add_executable(${TARGET} passkey.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-retrieval)
|
||||
add_executable(${TARGET} retrieval.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-speculative-simple)
|
||||
add_executable(${TARGET} speculative-simple.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -8,8 +8,24 @@
|
||||
#include <clocale>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cinttypes>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
|
||||
struct spec_checkpoint {
|
||||
int64_t n_tokens = 0;
|
||||
|
||||
std::vector<uint8_t> data;
|
||||
|
||||
size_t size() const {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return data.empty();
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
@@ -46,6 +62,14 @@ int main(int argc, char ** argv) {
|
||||
model_tgt = llama_init_tgt->model();
|
||||
ctx_tgt = llama_init_tgt->context();
|
||||
|
||||
// check if the context supports partial sequence removal
|
||||
const auto ctx_seq_rm = common_context_can_seq_rm(ctx_tgt);
|
||||
const bool use_ckpt = (ctx_seq_rm == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
|
||||
|
||||
if (use_ckpt) {
|
||||
LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
|
||||
|
||||
// load the draft model
|
||||
@@ -119,7 +143,7 @@ int main(int argc, char ** argv) {
|
||||
const auto t_enc_start = ggml_time_us();
|
||||
|
||||
// target model sampling context
|
||||
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
|
||||
common_sampler_ptr smpl(common_sampler_init(model_tgt, params.sampling));
|
||||
|
||||
// eval the prompt
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
|
||||
@@ -142,21 +166,61 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
|
||||
|
||||
size_t n_draft = 0;
|
||||
|
||||
llama_tokens draft;
|
||||
spec_checkpoint spec_ckpt;
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
|
||||
const auto t_dec_start = ggml_time_us();
|
||||
|
||||
while (true) {
|
||||
// optionally, generate draft tokens that can be appended to the target batch
|
||||
// generate or reuse draft tokens
|
||||
//
|
||||
// this is the most important part of the speculation. the more probable tokens that are provided here
|
||||
// the better the performance will be. in theory, this computation can be performed asynchronously and even
|
||||
// offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
|
||||
// from a cache or lookup tables.
|
||||
//
|
||||
llama_tokens draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
|
||||
if (draft.empty()) {
|
||||
// generate a new draft
|
||||
draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
|
||||
|
||||
//LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
|
||||
if ((int) draft.size() > params_spec.n_max) {
|
||||
LOG_WRN("draft size %zu exceeds max %d, truncating\n", draft.size(), params_spec.n_max);
|
||||
draft.resize(params_spec.n_max);
|
||||
}
|
||||
|
||||
if ((int) draft.size() < params_spec.n_min) {
|
||||
LOG_DBG("ignoring small draft: %zu < %d\n", draft.size(), params_spec.n_min);
|
||||
draft.clear();
|
||||
}
|
||||
|
||||
// save the original draft size
|
||||
n_draft = draft.size();
|
||||
|
||||
// save a checkpoint of the target context before evaluating the draft
|
||||
// this allows us to restore the state if partial draft acceptance occurs
|
||||
if (!draft.empty() && use_ckpt) {
|
||||
const size_t ckpt_size = llama_state_seq_get_size_ext(ctx_tgt, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
spec_ckpt.data.resize(ckpt_size);
|
||||
|
||||
const size_t n = llama_state_seq_get_data_ext(ctx_tgt, spec_ckpt.data.data(), ckpt_size, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
GGML_ASSERT(n == ckpt_size);
|
||||
|
||||
spec_ckpt.n_tokens = (int64_t) prompt_tgt.size();
|
||||
LOG_DBG("created speculative checkpoint (n_tokens = %" PRId64 ", size = %.3f MiB)\n",
|
||||
spec_ckpt.n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
|
||||
}
|
||||
} else {
|
||||
// we have a previous (partial) draft to reuse from checkpoint restoration
|
||||
if (use_ckpt) {
|
||||
GGML_ASSERT(!spec_ckpt.empty());
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(n_draft > 0);
|
||||
|
||||
// always have a token to evaluate from before - id_last
|
||||
common_batch_clear(batch_tgt);
|
||||
@@ -178,6 +242,12 @@ int main(int argc, char ** argv) {
|
||||
llama_decode(ctx_tgt, batch_tgt);
|
||||
}
|
||||
|
||||
// only save the sampler sampler state if we use checkpoints
|
||||
common_sampler_ptr smpl_save;
|
||||
if (use_ckpt) {
|
||||
smpl_save.reset(common_sampler_clone(smpl.get()));
|
||||
}
|
||||
|
||||
// sample from the full target batch and return the accepted tokens based on the target sampler
|
||||
//
|
||||
// for each token to be accepted, the sampler would have to sample that same token
|
||||
@@ -185,14 +255,38 @@ int main(int argc, char ** argv) {
|
||||
// available logits from the batch and sample the next token until we run out of logits or the sampler
|
||||
// disagrees with the draft
|
||||
//
|
||||
const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
|
||||
auto ids = common_sampler_sample_and_accept_n(smpl.get(), ctx_tgt, draft);
|
||||
|
||||
//LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());
|
||||
|
||||
GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
|
||||
|
||||
// check for partial draft acceptance:
|
||||
// if the context doesn't support partial sequence removal, restore the checkpoint
|
||||
// and make the accepted tokens the new partial draft for the next iteration
|
||||
if (use_ckpt && ids.size() - 1 < draft.size()) {
|
||||
LOG_DBG("partial acceptance: %zu < %zu, restoring checkpoint\n", ids.size() - 1, draft.size());
|
||||
|
||||
draft = std::move(ids);
|
||||
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx_tgt, spec_ckpt.data.data(), spec_ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
GGML_ASSERT(n == spec_ckpt.size());
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, spec_ckpt.n_tokens, -1);
|
||||
|
||||
prompt_tgt.resize(spec_ckpt.n_tokens);
|
||||
smpl = std::move(smpl_save);
|
||||
|
||||
n_past = (int) prompt_tgt.size();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
common_speculative_accept(spec, ids.size() - 1);
|
||||
|
||||
// full acceptance: consume the draft and commit accepted tokens
|
||||
n_past += ids.size() - 1;
|
||||
n_drafted += draft.size(); // note: we ignore the discarded small drafts
|
||||
n_drafted += n_draft; // note: we ignore the discarded small drafts
|
||||
n_accept += ids.size() - 1;
|
||||
n_predict += ids.size();
|
||||
|
||||
@@ -222,6 +316,9 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
|
||||
|
||||
// clear the draft since it has been consumed
|
||||
draft.clear();
|
||||
|
||||
{
|
||||
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
|
||||
|
||||
@@ -254,11 +351,10 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_INF("\n");
|
||||
LOG_INF("target:\n\n");
|
||||
common_perf_print(ctx_tgt, smpl);
|
||||
common_perf_print(ctx_tgt, smpl.get());
|
||||
|
||||
llama_batch_free(batch_tgt);
|
||||
|
||||
common_sampler_free(smpl);
|
||||
common_speculative_free(spec);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-speculative)
|
||||
add_executable(${TARGET} speculative.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -5,5 +5,5 @@
|
||||
set(TARGET llama-ls-sycl-device)
|
||||
add_executable(${TARGET} ls-sycl-device.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-finetune)
|
||||
add_executable(${TARGET} finetune.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
+3
-9
@@ -1,17 +1,11 @@
|
||||
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
|
||||
|
||||
# ref: https://cmake.org/cmake/help/latest/policy/CMP0194.html
|
||||
# MSVC is not a valid assembler for the ASM language.
|
||||
# Set to NEW to avoid a warning on CMake 4.1+ with MSVC.
|
||||
if (POLICY CMP0194)
|
||||
cmake_policy(SET CMP0194 NEW)
|
||||
endif()
|
||||
project("ggml" C CXX ASM)
|
||||
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 9)
|
||||
set(GGML_VERSION_PATCH 11)
|
||||
set(GGML_VERSION_MINOR 10)
|
||||
set(GGML_VERSION_PATCH 0)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||
@@ -219,7 +213,7 @@ set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
|
||||
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
|
||||
|
||||
option(GGML_HIP "ggml: use HIP" OFF)
|
||||
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
|
||||
option(GGML_HIP_GRAPHS "ggml: use HIP graph" ON)
|
||||
option(GGML_HIP_RCCL "ggml: use ROCm Collective Comm. Library" OFF)
|
||||
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||
|
||||
@@ -473,7 +473,7 @@ target_link_libraries(ggml-base PRIVATE Threads::Threads)
|
||||
find_library(MATH_LIBRARY m)
|
||||
if (MATH_LIBRARY)
|
||||
if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
target_link_libraries(ggml-base PRIVATE m)
|
||||
target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
+371
-220
@@ -1133,7 +1133,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
|
||||
if (t_ij->view_src != nullptr && ggml_backend_buffer_is_meta(t_ij->view_src->buffer)) {
|
||||
t_ij->view_src = ggml_backend_meta_buffer_simple_tensor(tensor->view_src, j);
|
||||
if (t_ij->view_offs > 0 && split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
|
||||
GGML_ASSERT(ne[split_dim] != 0 && tensor->ne[split_dim] != 0);
|
||||
GGML_ASSERT(tensor->ne[split_dim] != 0);
|
||||
const int split_dim_view_src = ggml_backend_meta_get_split_state(tensor->view_src, /*assume_sync =*/ true).axis;
|
||||
GGML_ASSERT(split_dim_view_src >= 0 && split_dim_view_src < GGML_MAX_DIMS);
|
||||
|
||||
@@ -1170,6 +1170,28 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
|
||||
|
||||
simple_tensors.push_back(t_ij);
|
||||
}
|
||||
|
||||
// If one of the sources has a zero-sized slice, disable the computation:
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (tensor->src[i] == nullptr || !ggml_backend_buffer_is_meta(tensor->src[i]->buffer)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const ggml_backend_meta_split_state split_state_src = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
|
||||
if (split_state_src.axis < 0 || split_state_src.axis >= GGML_MAX_DIMS) {
|
||||
continue;
|
||||
}
|
||||
for (size_t j = 0; j < n_simple_bufs; j++) {
|
||||
int64_t ne_sum = 0;
|
||||
for (size_t s = 0; s < split_state_src.n_segments; s++) {
|
||||
ne_sum += split_state_src.ne[s*n_simple_bufs + j];
|
||||
}
|
||||
if (ne_sum == 0) {
|
||||
simple_tensors[j]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buf_ctx->simple_tensors[tensor] = simple_tensors;
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
@@ -1270,7 +1292,45 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor));
|
||||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
|
||||
GGML_ASSERT(split_state.n_segments == 1);
|
||||
|
||||
if (split_state.n_segments != 1) {
|
||||
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
size_t offset_data = 0;
|
||||
std::vector<size_t> simple_offsets(n_bufs, 0);
|
||||
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
const int64_t blck_size = ggml_blck_size(tensor->type);
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
|
||||
tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*tensor->ne[1] == size);
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
|
||||
tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*tensor->ne[2] == size);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (split_state.axis) {
|
||||
case GGML_BACKEND_SPLIT_AXIS_0:
|
||||
@@ -1404,26 +1464,32 @@ struct ggml_backend_meta_context {
|
||||
struct backend_config {
|
||||
ggml_backend_t backend;
|
||||
|
||||
std::vector<cgraph_config> cgraphs;
|
||||
std::vector<ggml_tensor *> nodes;
|
||||
ggml_backend_buffer_ptr buf;
|
||||
std::vector<cgraph_config> cgraphs;
|
||||
std::vector<ggml_tensor *> nodes;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
backend_config(ggml_backend_t backend) : backend(backend) {}
|
||||
backend_config(ggml_backend_t backend, const size_t n_reduce_steps) : backend(backend) {
|
||||
bufs.resize(n_reduce_steps);
|
||||
}
|
||||
};
|
||||
std::string name;
|
||||
std::vector<backend_config> backend_configs;
|
||||
ggml_context_ptr ctx;
|
||||
std::vector<ggml_cgraph *> cgraphs_aux;
|
||||
std::vector<ggml_tensor *> nodes_aux;
|
||||
size_t n_reduce_steps;
|
||||
int max_nnodes = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
size_t max_subgraphs = 0;
|
||||
size_t n_subgraphs = 0;
|
||||
uint64_t uid = 0;
|
||||
|
||||
void * comm_ctx = nullptr;
|
||||
ggml_backend_comm_allreduce_tensor_t comm_allreduce = nullptr;
|
||||
|
||||
ggml_backend_meta_context(ggml_backend_dev_t meta_dev, const char * params) {
|
||||
const size_t n_devs = ggml_backend_meta_dev_n_devs(meta_dev);
|
||||
n_reduce_steps = std::ceil(std::log2(n_devs));
|
||||
name = "Meta(";
|
||||
std::vector<ggml_backend_t> simple_backends;
|
||||
backend_configs.reserve(n_devs);
|
||||
@@ -1435,7 +1501,7 @@ struct ggml_backend_meta_context {
|
||||
}
|
||||
name += ggml_backend_dev_name(simple_dev);
|
||||
simple_backends.push_back(ggml_backend_dev_init(simple_dev, params));
|
||||
backend_configs.emplace_back(simple_backends.back());
|
||||
backend_configs.emplace_back(simple_backends.back(), n_reduce_steps);
|
||||
}
|
||||
name += ")";
|
||||
|
||||
@@ -1465,10 +1531,6 @@ struct ggml_backend_meta_context {
|
||||
ggml_backend_free(bc.backend);
|
||||
}
|
||||
}
|
||||
|
||||
size_t n_reduce_steps() const {
|
||||
return std::ceil(std::log2(backend_configs.size()));
|
||||
}
|
||||
};
|
||||
|
||||
static const char * ggml_backend_meta_get_name(ggml_backend_t backend) {
|
||||
@@ -1578,6 +1640,9 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
const size_t n_backends = ggml_backend_meta_n_backends(backend);
|
||||
ggml_backend_meta_context * backend_ctx = (ggml_backend_meta_context *) backend->context;
|
||||
|
||||
// If the previous cgraph had a defined UID it can be used to skip rebuilding the subgraphs per simple backend.
|
||||
const bool needs_rebuild = (cgraph->uid == 0) || (cgraph->uid != backend_ctx->uid);
|
||||
|
||||
bool max_nnodes_raised = false;
|
||||
if (cgraph->n_nodes > backend_ctx->max_nnodes) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
@@ -1587,173 +1652,216 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
}
|
||||
backend_ctx->max_nnodes = cgraph->n_nodes;
|
||||
max_nnodes_raised = true;
|
||||
assert(needs_rebuild);
|
||||
}
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
|
||||
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
|
||||
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
|
||||
bcj.nodes[i] = node;
|
||||
continue;
|
||||
if (needs_rebuild) {
|
||||
size_t n_subgraphs = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
|
||||
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
|
||||
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
|
||||
bcj.nodes[i] = node;
|
||||
continue;
|
||||
}
|
||||
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
|
||||
GGML_ASSERT(bcj.nodes[i]);
|
||||
}
|
||||
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
|
||||
GGML_ASSERT(bcj.nodes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
size_t n_subgraphs = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
{
|
||||
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
|
||||
auto get_i_delayed = [&](const int i) -> int {
|
||||
int id = i; // i_delayed
|
||||
int idr = i; // i_delayed return, last safe return value
|
||||
{
|
||||
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
|
||||
auto get_i_delayed = [&](const int i) -> int {
|
||||
int id = i; // i_delayed
|
||||
int idr = i; // i_delayed return, last safe return value
|
||||
|
||||
ggml_tensor * node = cgraph->nodes[id];
|
||||
int32_t n_used = ggml_node_get_use_count(cgraph, id);
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
|
||||
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
node = next;
|
||||
ggml_tensor * node = cgraph->nodes[id];
|
||||
int32_t n_used = ggml_node_get_use_count(cgraph, id);
|
||||
|
||||
// Skip MIRRORED nodes that don't consume node
|
||||
auto skip_unrelated = [&]() {
|
||||
while (id + 1 < cgraph->n_nodes) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (ggml_backend_meta_get_split_state(next, false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
break;
|
||||
}
|
||||
bool safe = true;
|
||||
for (int s = 0; s < GGML_MAX_SRC; s++) {
|
||||
if (next->src[s] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (next->src[s] == node) {
|
||||
safe = false;
|
||||
break;
|
||||
}
|
||||
if (ggml_backend_meta_get_split_state(next->src[s], false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
safe = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!safe) {
|
||||
break;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
};
|
||||
|
||||
skip_unrelated();
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
|
||||
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
node = next;
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
// Chain of MULs with MIRRORED src[1]
|
||||
while (true) {
|
||||
skip_unrelated();
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_MUL && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
node = next;
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
for (int32_t k = 0; k < n_used; k++) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
|
||||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
|
||||
ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_MUL && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
node = next;
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
|
||||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
|
||||
for (int32_t k = 0; k < n_used - 2; k++) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
|
||||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
idr = id;
|
||||
return idr;
|
||||
}
|
||||
for (int32_t k = 0; k < n_used; k++) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
|
||||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
|
||||
ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
|
||||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
for (int32_t k = 0; k < n_used - 2; k++) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
|
||||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
idr = id;
|
||||
return idr;
|
||||
};
|
||||
};
|
||||
|
||||
int i_start = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
|
||||
continue;
|
||||
}
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
|
||||
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
|
||||
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
|
||||
}
|
||||
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
|
||||
if (!new_subgraph) {
|
||||
continue;
|
||||
}
|
||||
int i_start = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
|
||||
continue;
|
||||
}
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
|
||||
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
|
||||
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
|
||||
}
|
||||
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
|
||||
if (!new_subgraph) {
|
||||
continue;
|
||||
}
|
||||
|
||||
i = get_i_delayed(i);
|
||||
i = get_i_delayed(i);
|
||||
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
bcj.cgraphs[n_subgraphs].offset = i_start;
|
||||
}
|
||||
n_subgraphs++;
|
||||
i_start = i + 1;
|
||||
}
|
||||
GGML_ASSERT(i_start == cgraph->n_nodes);
|
||||
}
|
||||
|
||||
backend_ctx->uid = cgraph->uid;
|
||||
backend_ctx->n_subgraphs = n_subgraphs;
|
||||
|
||||
if (max_tmp_size > backend_ctx->max_tmp_size) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
bcj.cgraphs[n_subgraphs].offset = i_start;
|
||||
for (size_t i = 0; i < backend_ctx->n_reduce_steps; i++) {
|
||||
bcj.bufs[i].reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
|
||||
}
|
||||
}
|
||||
n_subgraphs++;
|
||||
i_start = i + 1;
|
||||
backend_ctx->max_tmp_size = max_tmp_size;
|
||||
}
|
||||
GGML_ASSERT(i_start == cgraph->n_nodes);
|
||||
}
|
||||
|
||||
if (max_tmp_size > backend_ctx->max_tmp_size) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
|
||||
}
|
||||
backend_ctx->max_tmp_size = max_tmp_size;
|
||||
}
|
||||
|
||||
|
||||
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
|
||||
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
|
||||
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
|
||||
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
|
||||
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
|
||||
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
|
||||
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
|
||||
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
backend_ctx->ctx.reset(ggml_init(params));
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i = 0; i < n_subgraphs; i++) {
|
||||
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
|
||||
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
|
||||
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
|
||||
const size_t n_nodes_per_device = 3 * backend_ctx->n_reduce_steps; // tmp + ADD (+zeroing) graph per step and device
|
||||
const size_t n_cgraphs_per_device = 2 * backend_ctx->n_reduce_steps; // ADD ( + zeroing) graph per step and device
|
||||
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
|
||||
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
|
||||
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
backend_ctx->ctx.reset(ggml_init(params));
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i = 0; i < n_subgraphs; i++) {
|
||||
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
|
||||
}
|
||||
}
|
||||
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
|
||||
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
|
||||
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
|
||||
}
|
||||
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
|
||||
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
|
||||
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
|
||||
}
|
||||
}
|
||||
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
|
||||
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
|
||||
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
|
||||
}
|
||||
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
|
||||
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
|
||||
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
|
||||
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
|
||||
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
|
||||
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
|
||||
cgraph_ij->n_nodes = i_node_stop - i_node_start;
|
||||
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
|
||||
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
|
||||
ggml_tensor * node_ij = bcj.nodes[i_node];
|
||||
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
|
||||
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
|
||||
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
|
||||
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
|
||||
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
|
||||
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
|
||||
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
|
||||
cgraph_ij->n_nodes = i_node_stop - i_node_start;
|
||||
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
|
||||
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
|
||||
ggml_tensor * node_ij = bcj.nodes[i_node];
|
||||
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
|
||||
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
|
||||
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
|
||||
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
|
||||
}
|
||||
cgraph_ij->uid = ggml_graph_next_uid();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1761,11 +1869,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
size_t iga = 0; // i graph aux
|
||||
size_t ina = 0; // i node aux
|
||||
|
||||
// FIXME usage_counts
|
||||
auto get_cgraph_aux = [&]() -> ggml_cgraph * {
|
||||
ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
|
||||
return ret;
|
||||
};
|
||||
auto get_node_aux = [&](ggml_tensor * t) -> ggml_tensor * {
|
||||
ggml_tensor * ret = backend_ctx->nodes_aux[ina++];
|
||||
memset(ret, 0, sizeof(ggml_tensor));
|
||||
@@ -1777,75 +1880,110 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
auto set_tmp_data = [&](ggml_tensor * tensor, const size_t j, const size_t i_buf) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
ggml_backend_buffer_ptr & buf_ptr = bcj.bufs[i_buf];
|
||||
if (!buf_ptr || ggml_backend_buffer_get_size(buf_ptr.get()) < backend_ctx->max_tmp_size) {
|
||||
buf_ptr.reset(ggml_backend_alloc_buffer(bcj.backend, backend_ctx->max_tmp_size));
|
||||
}
|
||||
tensor->buffer = buf_ptr.get();
|
||||
tensor->data = ggml_backend_buffer_get_base(buf_ptr.get());
|
||||
};
|
||||
// FIXME usage_counts
|
||||
auto get_cgraph_aux = [&]() -> ggml_cgraph * {
|
||||
ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
|
||||
return ret;
|
||||
};
|
||||
|
||||
// Preferentially use backend-specific allreduce_tensor_async (e.g. NCCL for CUDA), use a generic fallback if unavailable:
|
||||
auto allreduce_fallback = [&](size_t i) -> ggml_status {
|
||||
std::vector<ggml_cgraph *> step_cgraphs(n_backends, nullptr);
|
||||
|
||||
for (size_t offset_j = 1; offset_j < n_backends; offset_j *= 2) {
|
||||
// Zero out nodes that were disabled due to having a zero-sized slice:
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
ggml_tensor * node = bcj.cgraphs[i].cgraph_main->nodes[bcj.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
if (node->flags & GGML_TENSOR_FLAG_COMPUTE) {
|
||||
continue;
|
||||
}
|
||||
ggml_tensor * node_zero = get_node_aux(node);
|
||||
node_zero->op = GGML_OP_SCALE; // FIXME 0.0f * NaN == NaN
|
||||
node_zero->src[0] = node;
|
||||
ggml_set_op_params_f32(node_zero, 0, 0.0f);
|
||||
node_zero->data = node->data;
|
||||
node_zero->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
|
||||
step_cgraphs[j] = get_cgraph_aux();
|
||||
step_cgraphs[j]->nodes[0] = node_zero;
|
||||
step_cgraphs[j]->n_nodes = 1;
|
||||
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, step_cgraphs[j]);
|
||||
if (status != GGML_STATUS_SUCCESS) {
|
||||
return status;
|
||||
}
|
||||
}
|
||||
std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
|
||||
|
||||
auto push_data = [&](const size_t j_src, const size_t j_dst, const size_t i_buf) {
|
||||
assert(step_cgraphs[j_dst] == nullptr);
|
||||
auto & bcj_src = backend_ctx->backend_configs[j_src];
|
||||
auto & bcj_dst = backend_ctx->backend_configs[j_dst];
|
||||
|
||||
ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
GGML_ASSERT(ggml_is_contiguous(node_src));
|
||||
GGML_ASSERT(ggml_is_contiguous(node_dst));
|
||||
|
||||
ggml_tensor * node_tmp = get_node_aux(node_dst);
|
||||
set_tmp_data(node_tmp, j_dst, i_buf);
|
||||
|
||||
ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_tmp);
|
||||
|
||||
ggml_tensor * node_red = get_node_aux(node_dst);
|
||||
node_red->view_src = node_dst->view_src == nullptr ? node_dst : node_dst->view_src;
|
||||
node_red->view_offs = node_dst->view_offs;
|
||||
node_red->op = GGML_OP_ADD;
|
||||
node_red->src[0] = node_dst;
|
||||
node_red->src[1] = node_tmp;
|
||||
node_red->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
ggml_backend_view_init(node_red);
|
||||
|
||||
ggml_cgraph * cgraph_aux = get_cgraph_aux();
|
||||
cgraph_aux->nodes[0] = node_red;
|
||||
cgraph_aux->n_nodes = 1;
|
||||
step_cgraphs[j_dst] = cgraph_aux;
|
||||
};
|
||||
|
||||
size_t offset_j = n_backends/2;
|
||||
while ((offset_j & (offset_j - 1)) != 0) {
|
||||
offset_j--;
|
||||
}
|
||||
const size_t offset_j_max = offset_j;
|
||||
size_t i_buf = 0;
|
||||
|
||||
// If n_backends is not a power of 2, fold in the excess prior to butterfly reduction:
|
||||
for (size_t j_src = 2*offset_j_max; j_src < n_backends; j_src++) {
|
||||
const size_t j_dst = j_src - 2*offset_j_max;
|
||||
push_data(j_src, j_dst, i_buf);
|
||||
const ggml_status status = ggml_backend_graph_compute_async(backend_ctx->backend_configs[j_dst].backend, step_cgraphs[j_dst]);
|
||||
if (status != GGML_STATUS_SUCCESS) {
|
||||
return status;
|
||||
}
|
||||
i_buf = 1;
|
||||
}
|
||||
|
||||
// Butterfly reduction:
|
||||
for (; offset_j >= 1; offset_j /= 2) {
|
||||
std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
|
||||
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
for (size_t j = 0; j < 2*offset_j_max; j++) {
|
||||
const size_t j_other = j ^ offset_j;
|
||||
if (j_other > j) {
|
||||
if (j_other >= n_backends) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto & bcj1 = backend_ctx->backend_configs[j];
|
||||
auto & bcj2 = backend_ctx->backend_configs[j_other];
|
||||
|
||||
ggml_tensor * node1 = bcj1.cgraphs[i].cgraph_main->nodes[bcj1.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
ggml_tensor * node2 = bcj2.cgraphs[i].cgraph_main->nodes[bcj2.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
GGML_ASSERT(ggml_is_contiguous(node1));
|
||||
GGML_ASSERT(ggml_is_contiguous(node2));
|
||||
|
||||
// Tmp tensors to receive P2P copies
|
||||
ggml_tensor * node_tmp_1 = get_node_aux(node1);
|
||||
node_tmp_1->buffer = bcj1.buf.get();
|
||||
node_tmp_1->data = ggml_backend_buffer_get_base(bcj1.buf.get());
|
||||
|
||||
ggml_tensor * node_tmp_2 = get_node_aux(node2);
|
||||
node_tmp_2->buffer = bcj2.buf.get();
|
||||
node_tmp_2->data = ggml_backend_buffer_get_base(bcj2.buf.get());
|
||||
|
||||
// 2 P2P copies: exchange full buffers
|
||||
ggml_backend_tensor_copy_async(bcj1.backend, bcj2.backend, node1, node_tmp_2);
|
||||
ggml_backend_tensor_copy_async(bcj2.backend, bcj1.backend, node2, node_tmp_1);
|
||||
|
||||
// Local ADD: node1 += tmp1 (in-place via view)
|
||||
ggml_tensor * node_red_1 = get_node_aux(node1);
|
||||
node_red_1->view_src = node1->view_src == nullptr ? node1 : node1->view_src;
|
||||
node_red_1->view_offs = node1->view_offs;
|
||||
node_red_1->op = GGML_OP_ADD;
|
||||
node_red_1->src[0] = node1;
|
||||
node_red_1->src[1] = node_tmp_1;
|
||||
node_red_1->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
ggml_backend_view_init(node_red_1);
|
||||
|
||||
// Local ADD: node2 += tmp2 (in-place via view)
|
||||
ggml_tensor * node_red_2 = get_node_aux(node2);
|
||||
node_red_2->view_src = node2->view_src == nullptr ? node2 : node2->view_src;
|
||||
node_red_2->view_offs = node2->view_offs;
|
||||
node_red_2->op = GGML_OP_ADD;
|
||||
node_red_2->src[0] = node2;
|
||||
node_red_2->src[1] = node_tmp_2;
|
||||
node_red_2->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
ggml_backend_view_init(node_red_2);
|
||||
|
||||
// Build 1-node cgraphs for the ADD ops
|
||||
ggml_cgraph * cgraph_aux_1 = get_cgraph_aux();
|
||||
cgraph_aux_1->nodes[0] = node_red_1;
|
||||
cgraph_aux_1->n_nodes = 1;
|
||||
step_cgraphs[j] = cgraph_aux_1;
|
||||
|
||||
ggml_cgraph * cgraph_aux_2 = get_cgraph_aux();
|
||||
cgraph_aux_2->nodes[0] = node_red_2;
|
||||
cgraph_aux_2->n_nodes = 1;
|
||||
step_cgraphs[j_other] = cgraph_aux_2;
|
||||
push_data(j, j_other, i_buf);
|
||||
}
|
||||
|
||||
// Execute local ADDs for this step
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
for (size_t j = 0; j < 2*offset_j_max; j++) {
|
||||
if (step_cgraphs[j] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
@@ -1855,12 +1993,25 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
return status;
|
||||
}
|
||||
}
|
||||
i_buf++;
|
||||
}
|
||||
assert(i_buf == backend_ctx->n_reduce_steps);
|
||||
|
||||
// If n_backends is not a power of 2, copy back the reduced tensors to the excess:
|
||||
for (size_t j = 2*offset_j_max; j < n_backends; j++) {
|
||||
auto & bcj_src = backend_ctx->backend_configs[j - 2*offset_j_max];
|
||||
auto & bcj_dst = backend_ctx->backend_configs[j];
|
||||
|
||||
ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_dst);
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
};
|
||||
|
||||
|
||||
for (size_t i = 0; i < n_subgraphs; i++) {
|
||||
for (size_t i = 0; i < backend_ctx->n_subgraphs; i++) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, bcj.cgraphs[i].cgraph_main);
|
||||
@@ -1869,7 +2020,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
}
|
||||
}
|
||||
|
||||
if (n_backends > 1 && i < n_subgraphs - 1) {
|
||||
if (n_backends > 1 && i < backend_ctx->n_subgraphs - 1) {
|
||||
bool backend_allreduce_success = false;
|
||||
if (backend_ctx->comm_ctx) {
|
||||
std::vector<ggml_tensor *> nodes;
|
||||
|
||||
@@ -83,7 +83,6 @@
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||
// quants.c
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
|
||||
@@ -151,8 +151,6 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const block_q1_0 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv = vdupq_n_f32(0.0f);
|
||||
|
||||
@@ -212,31 +210,13 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
}
|
||||
|
||||
sumf = vaddvq_f32(sumv);
|
||||
*s = vaddvq_f32(sumv);
|
||||
#else
|
||||
// Scalar fallback
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
// Process 4 Q8_0 blocks
|
||||
for (int k = 0; k < 4; k++) {
|
||||
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const int bit_index = k * QK8_0 + j;
|
||||
const int byte_index = bit_index / 8;
|
||||
const int bit_offset = bit_index % 8;
|
||||
|
||||
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
|
||||
sumi += xi * y[i*4 + k].qs[j];
|
||||
}
|
||||
sumf += d0 * d1 * sumi;
|
||||
}
|
||||
}
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -274,6 +274,18 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
|
||||
}
|
||||
#endif
|
||||
#elif defined(__SSSE3__)
|
||||
static inline __m128i bytes_from_bits_16(const uint8_t * x) {
|
||||
uint16_t x16;
|
||||
memcpy(&x16, x, sizeof(uint16_t));
|
||||
|
||||
const __m128i shuf_mask = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
|
||||
__m128i bytes = _mm_shuffle_epi8(_mm_set1_epi16((short) x16), shuf_mask);
|
||||
const __m128i bit_mask = _mm_set_epi64x(0x7fbfdfeff7fbfdfe, 0x7fbfdfeff7fbfdfe);
|
||||
bytes = _mm_or_si128(bytes, bit_mask);
|
||||
|
||||
return _mm_cmpeq_epi8(bytes, _mm_set1_epi64x(-1));
|
||||
}
|
||||
|
||||
// horizontally add 4x4 floats
|
||||
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
|
||||
__m128 res_0 =_mm_hadd_ps(a, b);
|
||||
@@ -540,6 +552,152 @@ static inline __m128i get_scale_shuffle(int i) {
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK1_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_q1_0 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
#if defined(__AVX2__)
|
||||
const __m256i ones_8 = _mm256_set1_epi8(1);
|
||||
const __m256i ones_16 = _mm256_set1_epi16(1);
|
||||
const __m256i byte_shuf = _mm256_setr_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
|
||||
const __m256i bit_masks = _mm256_setr_epi8(
|
||||
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128,
|
||||
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128);
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
|
||||
const uint32_t * GGML_RESTRICT qs32 = (const uint32_t *) x[ib].qs;
|
||||
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
|
||||
|
||||
__m256 acc_block;
|
||||
{
|
||||
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[0].qs);
|
||||
const __m256i sm = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[0]), byte_shuf), bit_masks), zero);
|
||||
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
|
||||
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
|
||||
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), _mm256_cvtepi32_ps(s32));
|
||||
}
|
||||
for (int K = 1; K < 4; ++K) {
|
||||
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[K].qs);
|
||||
const __m256i sm = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[K]), byte_shuf), bit_masks), zero);
|
||||
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
|
||||
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
|
||||
acc_block = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), _mm256_cvtepi32_ps(s32), acc_block);
|
||||
}
|
||||
acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#elif defined(__AVX__)
|
||||
const __m128i ones_8 = _mm_set1_epi8(1);
|
||||
const __m128i ones_16 = _mm_set1_epi16(1);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
|
||||
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
|
||||
__m256 acc_block;
|
||||
{
|
||||
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[0]);
|
||||
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
|
||||
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
|
||||
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[0]);
|
||||
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[16]);
|
||||
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
|
||||
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
|
||||
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
|
||||
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
|
||||
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
|
||||
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
|
||||
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
|
||||
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
|
||||
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
|
||||
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), q);
|
||||
}
|
||||
for(int K = 1; K < 4; ++K) {
|
||||
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[(K) * 4]);
|
||||
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
|
||||
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
|
||||
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[0]);
|
||||
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[16]);
|
||||
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
|
||||
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
|
||||
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
|
||||
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
|
||||
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
|
||||
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
|
||||
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
|
||||
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
|
||||
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
|
||||
acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(K)].d)), q));
|
||||
}
|
||||
#undef Q1_AVX_BLOCK
|
||||
|
||||
acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block));
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#elif defined(__SSSE3__)
|
||||
const __m128i ones_8 = _mm_set1_epi8(1);
|
||||
const __m128i ones_16 = _mm_set1_epi16(1);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m128 acc_0 = _mm_setzero_ps();
|
||||
__m128 acc_1 = _mm_setzero_ps();
|
||||
__m128 acc_2 = _mm_setzero_ps();
|
||||
__m128 acc_3 = _mm_setzero_ps();
|
||||
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
const __m128 d0 = _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
|
||||
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
|
||||
|
||||
#define Q1_SSSE3_BLOCK(QS_OFF, Y_IDX, ACC) \
|
||||
{ \
|
||||
const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 0]); \
|
||||
const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 2]); \
|
||||
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[0]); \
|
||||
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[16]); \
|
||||
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero); \
|
||||
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero); \
|
||||
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0); \
|
||||
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1); \
|
||||
const __m128i sum_0 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_0), ones_16); \
|
||||
const __m128i sum_1 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_1), ones_16); \
|
||||
const __m128 q = _mm_cvtepi32_ps(_mm_add_epi32(sum_0, sum_1)); \
|
||||
(ACC) = _mm_add_ps((ACC), _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(Y_IDX)].d))), q)); \
|
||||
}
|
||||
Q1_SSSE3_BLOCK(0, 0, acc_0)
|
||||
Q1_SSSE3_BLOCK(4, 1, acc_1)
|
||||
Q1_SSSE3_BLOCK(8, 2, acc_2)
|
||||
Q1_SSSE3_BLOCK(12, 3, acc_3)
|
||||
#undef Q1_SSSE3_BLOCK
|
||||
}
|
||||
|
||||
*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
@@ -137,22 +137,28 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||
float sumf = 0.0;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
float sumi = 0.0f;
|
||||
|
||||
for (int k = 0; k < 4; k++) {
|
||||
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
|
||||
|
||||
const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
|
||||
const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
|
||||
int sumi_block = 0;
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const int bit_index = k * QK8_0 + j;
|
||||
const int byte_index = bit_index / 8;
|
||||
const int bit_offset = bit_index % 8;
|
||||
const uint8_t * GGML_RESTRICT bits = &x[i].qs[k * 4];
|
||||
const int8_t * GGML_RESTRICT qy = yb->qs;
|
||||
|
||||
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
|
||||
sumi_block += xi * y[i*4 + k].qs[j];
|
||||
for (int b = 0; b < 4; ++b, qy += 8) {
|
||||
const unsigned mask = bits[b];
|
||||
sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
|
||||
+ ((mask & 0x02) ? qy[1] : -qy[1])
|
||||
+ ((mask & 0x04) ? qy[2] : -qy[2])
|
||||
+ ((mask & 0x08) ? qy[3] : -qy[3])
|
||||
+ ((mask & 0x10) ? qy[4] : -qy[4])
|
||||
+ ((mask & 0x20) ? qy[5] : -qy[5])
|
||||
+ ((mask & 0x40) ? qy[6] : -qy[6])
|
||||
+ ((mask & 0x80) ? qy[7] : -qy[7]);
|
||||
}
|
||||
|
||||
sumi += d1 * sumi_block;
|
||||
|
||||
@@ -269,10 +269,6 @@ static const char * cu_get_error_str(CUresult err) {
|
||||
#define FLASH_ATTN_AVAILABLE
|
||||
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
|
||||
|
||||
#if defined(TURING_MMA_AVAILABLE)
|
||||
#define LDMATRIX_TRANS_AVAILABLE
|
||||
#endif // defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
static bool fp16_available(const int cc) {
|
||||
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL ||
|
||||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_PH1);
|
||||
@@ -1187,6 +1183,7 @@ struct ggml_cuda_graph {
|
||||
bool disable_due_to_gpu_arch = false;
|
||||
bool warmup_complete = false;
|
||||
uint64_t uid = 0;
|
||||
int64_t last_used_time = 0;
|
||||
struct node_properties {
|
||||
ggml_tensor node;
|
||||
void * node_src_data_ptrs[GGML_MAX_SRC];
|
||||
@@ -1368,12 +1365,28 @@ struct ggml_backend_cuda_context {
|
||||
// when the computation is split across CPU/GPU (e.g., with --n-cpu-moe)
|
||||
std::unordered_map<const void *, std::unique_ptr<ggml_cuda_graph>> cuda_graphs;
|
||||
|
||||
int64_t last_graph_eviction_sweep = 0;
|
||||
|
||||
ggml_cuda_graph * cuda_graph(const void * first_node_ptr) {
|
||||
const int64_t time_now = ggml_time_us();
|
||||
|
||||
// sweep every 5s, evicting cuda graphs unused for >=10s
|
||||
if (time_now - last_graph_eviction_sweep >= 5'000'000) {
|
||||
last_graph_eviction_sweep = time_now;
|
||||
for (auto it = cuda_graphs.begin(); it != cuda_graphs.end(); ) {
|
||||
if (time_now - it->second->last_used_time >= 10'000'000) {
|
||||
it = cuda_graphs.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto it = cuda_graphs.find(first_node_ptr);
|
||||
if (it == cuda_graphs.end()) {
|
||||
cuda_graphs[first_node_ptr] = std::make_unique<ggml_cuda_graph>();
|
||||
return cuda_graphs[first_node_ptr].get();
|
||||
it = cuda_graphs.emplace(first_node_ptr, std::make_unique<ggml_cuda_graph>()).first;
|
||||
}
|
||||
it->second->last_used_time = time_now;
|
||||
return it->second.get();
|
||||
}
|
||||
|
||||
|
||||
@@ -305,12 +305,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
|
||||
const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int D2, const int stride_KV, const int i_sup) {
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
// K/V data is loaded with decreasing granularity for D for better memory bandwidth.
|
||||
// The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.
|
||||
// The minimum granularity is 16 bytes.
|
||||
constexpr int h2_per_chunk = 16/sizeof(half2);
|
||||
const int chunks_per_row = D2 / h2_per_chunk;
|
||||
if constexpr (use_cp_async) {
|
||||
static_assert(warp_size == 32, "bad warp_size");
|
||||
static_assert(!oob_check, "OOB check not compatible with cp_async");
|
||||
constexpr int preload = 64;
|
||||
constexpr int h2_per_chunk = 16/sizeof(half2);
|
||||
const int chunks_per_row = D2 / h2_per_chunk;
|
||||
|
||||
const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV);
|
||||
|
||||
@@ -348,11 +349,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
|
||||
// 6: max 1*16= 16 bytes, 8 half
|
||||
ggml_cuda_unroll<6>{}(load);
|
||||
} else {
|
||||
// TODO use ggml_cuda_memcpy_1
|
||||
const half2 zero[4] = {{0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}};
|
||||
auto load = [&] __device__ (const int n) {
|
||||
const int stride_k = warp_size >> n;
|
||||
const int k0_start = stride_k == warp_size ? 0 : D2 - D2 % (2*stride_k);
|
||||
const int k0_stop = D2 - D2 % (1*stride_k);
|
||||
const int stride_k = 32 >> n;
|
||||
const int k0_start = stride_k == 32 ? 0 : chunks_per_row - chunks_per_row % (2*stride_k);
|
||||
const int k0_stop = chunks_per_row - chunks_per_row % (1*stride_k);
|
||||
const int stride_i = warp_size / stride_k;
|
||||
|
||||
if (k0_start == k0_stop) {
|
||||
@@ -371,15 +372,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
|
||||
for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
|
||||
const int k = k0 + (stride_k == warp_size ? threadIdx.x : threadIdx.x % stride_k);
|
||||
|
||||
tile_KV[i*stride_tile + k] = !oob_check || i < i_sup ? KV[i*stride_KV + k] : make_half2(0.0f, 0.0f);
|
||||
ggml_cuda_memcpy_1<16>(tile_KV + i*stride_tile + k*4,
|
||||
!oob_check || i < i_sup ? KV + i*stride_KV + k*h2_per_chunk : zero);
|
||||
}
|
||||
}
|
||||
};
|
||||
// 1: max 32* 4=128 bytes, 64 half
|
||||
// 2: max 16* 4= 64 bytes, 32 half
|
||||
// 3: max 8* 4= 32 bytes, 16 half
|
||||
// 4: max 4* 4= 16 bytes, 8 half
|
||||
ggml_cuda_unroll<4>{}(load);
|
||||
// 1: max 32*16=512 bytes, 256 half
|
||||
// 2: max 16*16=256 bytes, 128 half
|
||||
// 3: max 8*16=128 bytes, 64 half
|
||||
// 4: max 4*16= 64 bytes, 32 half
|
||||
// 5: max 2*16= 32 bytes, 16 half
|
||||
// 6: max 1*16= 16 bytes, 8 half
|
||||
ggml_cuda_unroll<6>{}(load);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -862,11 +866,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
}
|
||||
|
||||
|
||||
#if defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
|
||||
T_A_VKQ A_identity;
|
||||
make_identity_mat(A_identity);
|
||||
#endif // defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
|
||||
|
||||
// Calculate VKQ tile, need to use logical rather than physical elements for i0 due to transposition of V:
|
||||
#pragma unroll
|
||||
for (int i0_start = 0; i0_start < DV; i0_start += 2*nbatch_V2) {
|
||||
@@ -897,29 +896,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
const int k0 = k00 + (threadIdx.y % np)*T_A_VKQ::J;
|
||||
|
||||
T_A_VKQ A; // Transposed in SRAM but not in registers, gets transposed on load.
|
||||
#if defined(LDMATRIX_TRANS_AVAILABLE)
|
||||
load_ldmatrix_trans(A, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
// MFMA A register layout: A_mat[i=lane%16][k=4*(lane/16)+reg].
|
||||
// Normal load gives A_mat[seq][dv] but we need A_mat[dv][seq] = V^T.
|
||||
// Load with transposed addressing: 4 strided half loads.
|
||||
{
|
||||
const half2 * xs0 = tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2;
|
||||
const half * xs0_h = (const half *) xs0;
|
||||
const int stride_h = stride_tile_V * 2; // stride in half units
|
||||
half * A_h = (half *) A.x;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
A_h[l] = xs0_h[(4*(threadIdx.x / 16) + l) * stride_h + threadIdx.x % 16];
|
||||
}
|
||||
}
|
||||
#else
|
||||
// TODO: Try to transpose tile_V when loading gmem to smem.
|
||||
// Use mma to transpose T_A_VKQ for RDNA.
|
||||
T_A_VKQ A_trans;
|
||||
load_ldmatrix(A_trans, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
|
||||
mma(A, A_trans, A_identity);
|
||||
#endif // defined(LDMATRIX_TRANS_AVAILABLE)
|
||||
if constexpr (T_B_KQ::I == 8) {
|
||||
mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
|
||||
} else {
|
||||
|
||||
@@ -368,15 +368,21 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
}
|
||||
|
||||
~ggml_cuda_pool_leg() {
|
||||
clear_pool();
|
||||
GGML_ASSERT(pool_size == 0);
|
||||
}
|
||||
|
||||
void clear_pool() {
|
||||
ggml_cuda_set_device(device);
|
||||
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
||||
ggml_cuda_buffer & b = buffer_pool[i];
|
||||
if (b.ptr != nullptr) {
|
||||
CUDA_CHECK(cudaFree(b.ptr));
|
||||
pool_size -= b.size;
|
||||
b.ptr = nullptr;
|
||||
b.size = 0;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(pool_size == 0);
|
||||
}
|
||||
|
||||
void * alloc(size_t size, size_t * actual_size) override {
|
||||
@@ -421,7 +427,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||
ggml_cuda_set_device(device);
|
||||
CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
|
||||
cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
|
||||
if (err == cudaErrorMemoryAllocation) {
|
||||
(void)cudaGetLastError();
|
||||
const size_t cached_bytes = pool_size;
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, flushing %.2f MiB of cached buffers and retrying\n",
|
||||
device, look_ahead_size/1024.0/1024.0, cached_bytes/1024.0/1024.0);
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
clear_pool();
|
||||
err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
|
||||
if (err == cudaSuccess) {
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded\n", device);
|
||||
}
|
||||
}
|
||||
CUDA_CHECK(err);
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
@@ -1203,6 +1222,13 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
// For small tensors, simply reduce them as FP32.
|
||||
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
|
||||
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
if ((tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, ggml_nbytes(tensors[i]), cuda_ctx->stream()));
|
||||
}
|
||||
}
|
||||
NCCL_CHECK(ncclGroupStart());
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
|
||||
@@ -1224,7 +1250,11 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
tmp[i].alloc(ne);
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
|
||||
if (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) {
|
||||
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
|
||||
} else {
|
||||
CUDA_CHECK(cudaMemsetAsync(tmp[i].get(), 0, ne * sizeof(nv_bfloat16), cuda_ctx->stream()));
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -3562,6 +3592,30 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
||||
return true;
|
||||
}
|
||||
|
||||
if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_SQR
|
||||
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_RELU) {
|
||||
const ggml_tensor * unary = cgraph->nodes[node_idx];
|
||||
const ggml_tensor * sqr = cgraph->nodes[node_idx+1];
|
||||
|
||||
if (ggml_get_unary_op(unary) != GGML_UNARY_OP_RELU) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (unary->type != GGML_TYPE_F32 && unary->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (unary->type != sqr->type) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_is_contiguous(unary->src[0])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
|
||||
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
|
||||
const ggml_tensor *scale = cgraph->nodes[node_idx];
|
||||
@@ -4070,6 +4124,12 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_SQR }, { GGML_UNARY_OP_RELU })) {
|
||||
ggml_cuda_op_relu_sqr(*cuda_ctx, node, cgraph->nodes[i+1]);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
|
||||
i += 2;
|
||||
ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
|
||||
|
||||
+79
-166
@@ -86,17 +86,12 @@ namespace ggml_cuda_mma {
|
||||
// - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
|
||||
// - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
|
||||
|
||||
static constexpr bool is_i_major(const data_layout dl) {
|
||||
return dl == DATA_LAYOUT_I_MAJOR ||
|
||||
dl == DATA_LAYOUT_I_MAJOR_MIRRORED;
|
||||
}
|
||||
|
||||
static constexpr __device__ data_layout get_input_data_layout() {
|
||||
#if defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(RDNA3) || defined(VOLTA_MMA_AVAILABLE)
|
||||
return DATA_LAYOUT_I_MAJOR_MIRRORED;
|
||||
#else
|
||||
return DATA_LAYOUT_I_MAJOR;
|
||||
#endif // defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(RDNA3) || defined(VOLTA_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <int I_, int J_, typename T, data_layout ds_=DATA_LAYOUT_I_MAJOR>
|
||||
@@ -113,7 +108,6 @@ namespace ggml_cuda_mma {
|
||||
T x[ne] = {0};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 64 && J == 2) return true;
|
||||
if (I == 16 && J == 8) return true;
|
||||
if (I == 32 && J == 4) return true;
|
||||
if (I == 16 && J == 16) return true;
|
||||
@@ -122,7 +116,7 @@ namespace ggml_cuda_mma {
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
return threadIdx.x % 16;
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
return threadIdx.x % 16;
|
||||
@@ -139,8 +133,8 @@ namespace ggml_cuda_mma {
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
return (2 * ((threadIdx.x / 16) % 2) + l);
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
return threadIdx.x / 16;
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
return 2 * (threadIdx.x / 16) + l;
|
||||
} else if constexpr (I == 32 && J == 4) {
|
||||
@@ -154,7 +148,7 @@ namespace ggml_cuda_mma {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#elif defined(VOLTA_MMA_AVAILABLE)
|
||||
static constexpr int ne = I * J / 32;
|
||||
T x[ne] = {0};
|
||||
|
||||
@@ -283,7 +277,7 @@ namespace ggml_cuda_mma {
|
||||
static constexpr int J = J_;
|
||||
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
|
||||
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(VOLTA_MMA_AVAILABLE)
|
||||
static constexpr int ne = I * J / WARP_SIZE;
|
||||
half2 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
@@ -407,7 +401,7 @@ namespace ggml_cuda_mma {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE)
|
||||
};
|
||||
|
||||
template <int I_, int J_>
|
||||
@@ -701,57 +695,12 @@ namespace ggml_cuda_mma {
|
||||
}
|
||||
#endif // defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
static __device__ __forceinline__ void make_identity_mat(tile<16, 8, half2> & t) {
|
||||
#if defined(RDNA4)
|
||||
const int row = t.get_i(0);
|
||||
const int left_right = t.get_j(0) / 4;
|
||||
const int up_down = row / 8;
|
||||
const int idx = row % 8;
|
||||
reinterpret_cast<half*>(t.x)[idx] = left_right == up_down ? 1.0f : 0.0f;
|
||||
#else
|
||||
GGML_UNUSED_VARS(t);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(RDNA4)
|
||||
}
|
||||
|
||||
template <int I, int J, typename T, data_layout dl>
|
||||
static __device__ __forceinline__ void load_generic(tile<I, J, T, dl> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
} else {
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
// All wmma layout has contiguous data when i-major.
|
||||
if constexpr (is_i_major(dl)) {
|
||||
// the data must be aligned to 16 bytes when bigger than ggml_cuda_get_max_cpy_bytes()
|
||||
constexpr int aligned_copy_bytes = ggml_cuda_get_max_cpy_bytes();
|
||||
if constexpr (sizeof(t.x) > aligned_copy_bytes) {
|
||||
static_assert(sizeof(t.x) % aligned_copy_bytes == 0, "bad type size");
|
||||
constexpr int aligned_copy_count = sizeof(t.x)/aligned_copy_bytes;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < aligned_copy_count; ++i) {
|
||||
ggml_cuda_memcpy_1<aligned_copy_bytes>(t.x + t.ne/aligned_copy_count*i, xs0 + t.get_i(0) * stride + t.get_j(t.ne/aligned_copy_count*i));
|
||||
}
|
||||
} else {
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
#endif // defined(AMD_MFMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -764,26 +713,37 @@ namespace ggml_cuda_mma {
|
||||
: "=r"(xi[0]), "=r"(xi[1])
|
||||
: "l"(xs));
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template <typename T, data_layout dl>
|
||||
static __device__ __forceinline__ void load_ldmatrix(
|
||||
tile<16, 4, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
tile<16, 4, T, dl> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
int * xi = (int *) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride;
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
|
||||
: "=r"(xi[0]), "=r"(xi[1])
|
||||
: "l"(xs));
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
#ifdef RDNA3
|
||||
static_assert(dl == DATA_LAYOUT_I_MAJOR_MIRRORED, "bad data layout");
|
||||
static_assert(sizeof(t.x) == 16, "bad ne");
|
||||
ggml_cuda_memcpy_1<8>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
|
||||
ggml_cuda_memcpy_1<8>(t.x + 2, xs0 + t.get_i(0)*stride + 2);
|
||||
#else
|
||||
static_assert(dl == DATA_LAYOUT_I_MAJOR, "bad data layout");
|
||||
static_assert(sizeof(t.x) == 8, "bad ne");
|
||||
ggml_cuda_memcpy_1<8>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
|
||||
#endif // RDNA3
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
static_assert(sizeof(t.x) == 4, "bad ne");
|
||||
ggml_cuda_memcpy_1<4>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
|
||||
#else
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -796,19 +756,26 @@ namespace ggml_cuda_mma {
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
|
||||
: "=r"(xi[0]), "=r"(xi[1]), "=r"(xi[2]), "=r"(xi[3])
|
||||
: "l"(xs));
|
||||
#else
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if 1
|
||||
// TODO: more generic handling
|
||||
static_assert(sizeof(T) == 4, "bad type size");
|
||||
#elif defined(VOLTA_MMA_AVAILABLE)
|
||||
ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
|
||||
ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 4, xs0 + t.get_i(4)*stride + 4);
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
#ifdef RDNA3
|
||||
static_assert(dl == DATA_LAYOUT_I_MAJOR_MIRRORED, "bad data layout");
|
||||
static_assert(sizeof(t.x) == 32, "bad ne");
|
||||
ggml_cuda_memcpy_1<16>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
|
||||
ggml_cuda_memcpy_1<16>(t.x + 4, xs0 + t.get_i(0)*stride + 4);
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
#endif // 1
|
||||
static_assert(dl == DATA_LAYOUT_I_MAJOR, "bad data layout");
|
||||
static_assert(sizeof(t.x) == 16, "bad ne");
|
||||
ggml_cuda_memcpy_1<16>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
|
||||
#endif // RDNA3
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
static_assert(sizeof(t.x) == 8, "bad ne");
|
||||
ggml_cuda_memcpy_1<8>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -827,23 +794,30 @@ namespace ggml_cuda_mma {
|
||||
|
||||
static __device__ __forceinline__ void load_ldmatrix(
|
||||
tile<32, 4, half2> & t, const half2 * __restrict__ xs0, const int stride) {
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(VOLTA_MMA_AVAILABLE)
|
||||
ggml_cuda_memcpy_1<4*sizeof(half2)>(t.x, xs0 + t.get_i(0)*stride);
|
||||
#else
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ void load_ldmatrix_trans(
|
||||
tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
int * xi = (int * ) t.x;
|
||||
int * xi = (int *) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.b16 {%0, %1, %2, %3}, [%4];"
|
||||
: "=r"(xi[0]), "=r"(xi[2]), "=r"(xi[1]), "=r"(xi[3])
|
||||
: "l"(xs));
|
||||
#elif defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
half * xh = (half *) t.x;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
xh[2*l + 0] = ((const half *) xs0)[(2*t.get_j(l) + 0)*(2*stride) + t.get_i(l)];
|
||||
xh[2*l + 1] = ((const half *) xs0)[(2*t.get_j(l) + 1)*(2*stride) + t.get_i(l)];
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
@@ -1218,73 +1192,27 @@ namespace ggml_cuda_mma {
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * acc = (int32x4_t *) D.x;
|
||||
#if defined(CDNA4) || defined(CDNA3)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
|
||||
((int64_t *) B.x)[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0], ((int64_t *) B.x)[0], acc[0], 0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA1)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
|
||||
B.x[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
|
||||
B.x[1],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1], B.x[1], acc[0], 0, 0, 0);
|
||||
#endif // defined(CDNA4) || defined(CDNA3)
|
||||
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
||||
int32x8_t * acc = (int32x8_t *) D.x;
|
||||
|
||||
#if defined(RDNA4)
|
||||
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||
int32x2_t * a_vec = (int32x2_t *) A.x;
|
||||
int32x2_t * b_vec = (int32x2_t *) B.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||
true,
|
||||
a_vec[1],
|
||||
true,
|
||||
b_vec[1],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[0], true, b_vec[0], acc[0], true);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[1], true, b_vec[1], acc[0], true);
|
||||
#elif defined(RDNA3)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * a_vec = (int32x4_t *) A.x;
|
||||
int32x4_t * b_vec = (int32x4_t *) B.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
true,
|
||||
a_vec[1],
|
||||
true,
|
||||
b_vec[1],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[0], true, b_vec[0], acc[0], true);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[1], true, b_vec[1], acc[0], true);
|
||||
#endif // RDNA4
|
||||
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
@@ -1297,19 +1225,10 @@ namespace ggml_cuda_mma {
|
||||
using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
|
||||
int32x16_t * acc = (int32x16_t *) D.x;
|
||||
#if defined(CDNA4) || defined(CDNA3)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
|
||||
((int64_t *) B.x)[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0], ((int64_t *) B.x)[0], acc[0], 0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA1)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
|
||||
B.x[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
|
||||
B.x[1],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1], B.x[1], acc[0], 0, 0, 0);
|
||||
#endif // defined(CDNA4) || defined(CDNA3)
|
||||
|
||||
#else
|
||||
@@ -1329,7 +1248,7 @@ namespace ggml_cuda_mma {
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<32, 8, float> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & B) {
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(VOLTA_MMA_AVAILABLE)
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
int * Dxi = (int *) D.x;
|
||||
@@ -1344,12 +1263,12 @@ namespace ggml_cuda_mma {
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<32, 4, half2> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> & B) {
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(VOLTA_MMA_AVAILABLE)
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
int * Dxi = (int *) D.x;
|
||||
@@ -1364,41 +1283,35 @@ namespace ggml_cuda_mma {
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <data_layout dl_d, data_layout dl_ab>
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, int, dl_d> & D, const tile<16, 4, int, dl_ab> & A, const tile<16, 4, int, dl_ab> & B) {
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * acc = (int32x4_t *) D.x;
|
||||
#if defined(CDNA4) || defined(CDNA3)
|
||||
const int64_t xA = uint32_t(A.x[0]);
|
||||
const int64_t xB = uint32_t(B.x[0]);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(xA, xB, acc[0], 0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA1)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
|
||||
#endif // defined(CDNA4) || defined(CDNA3)
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
||||
int32x8_t * acc = (int32x8_t *) D.x;
|
||||
#if defined(RDNA4)
|
||||
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||
int32x2_t * a_vec = (int32x2_t *) A.x;
|
||||
int32x2_t * b_vec = (int32x2_t *) B.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
false
|
||||
);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[0], true, b_vec[0], acc[0], false);
|
||||
#elif defined(RDNA3)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * a_vec = (int32x4_t *) A.x;
|
||||
int32x4_t * b_vec = (int32x4_t *) B.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
false
|
||||
);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[0], true, b_vec[0], acc[0], false);
|
||||
#endif // RDNA4
|
||||
#else
|
||||
GGML_UNUSED(D);
|
||||
|
||||
+16
-185
@@ -104,7 +104,7 @@ struct tile_x_sizes {
|
||||
};
|
||||
|
||||
static int get_mmq_x_max_host(const int cc) {
|
||||
return (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
|
||||
return (turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
|
||||
GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
128 : 64;
|
||||
@@ -114,9 +114,9 @@ static int get_mmq_x_max_host(const int cc) {
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_mmq_x_max_device() {
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
return 128;
|
||||
#else // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
#else // defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
#if defined(GGML_USE_HIP)
|
||||
return 64;
|
||||
@@ -1054,13 +1054,13 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
float dB;
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
@@ -1295,13 +1295,13 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float2 dsB = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
|
||||
@@ -1435,57 +1435,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
typedef tile<64, 2, int, input_layout> tile_load;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
|
||||
|
||||
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
|
||||
|
||||
const int * x_qs = (const int *) x;
|
||||
const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
|
||||
const int * y_qs = (const int *) y + 4;
|
||||
const float * y_df = (const float *) y;
|
||||
|
||||
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
|
||||
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B[1];
|
||||
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1] / 2;
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
tile_C C;
|
||||
mma(C, A[n], B[0]);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
|
||||
sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4] * dB;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 4, int, input_layout> tile_A;
|
||||
typedef tile<16, 4, int, input_layout> tile_B;
|
||||
@@ -1510,13 +1460,13 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
|
||||
@@ -1742,74 +1692,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
typedef tile<64, 2, int, input_layout> tile_load;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
|
||||
|
||||
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
|
||||
|
||||
const int * x_qs = (const int *) x;
|
||||
const half2 * x_dm = (const half2 *) x_qs + MMQ_TILE_NE_K*2;
|
||||
const int * y_qs = (const int *) y + 4;
|
||||
const half2 * y_ds = (const half2 *) y;
|
||||
|
||||
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
|
||||
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B[1];
|
||||
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x/2 : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y/2;
|
||||
const float sB = (k01 >= MMQ_TILE_NE_K * 3/4) ? 0
|
||||
: (((k01/4)%2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).y
|
||||
: __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).x);
|
||||
|
||||
tile_C Cm;
|
||||
if (k01 >= MMQ_TILE_NE_K * 3/4) {
|
||||
tile_A A1;
|
||||
A1.x[0] = 0x01010101;
|
||||
A1.x[1] = 0x01010101;
|
||||
mma(Cm, A1, B[0]);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
tile_C Cd;
|
||||
mma(Cd, A[n], B[0]);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
|
||||
const float2 dm = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + k0/4]);
|
||||
float tmp = Cd.x[l]*dm.x;
|
||||
if (k01 >= MMQ_TILE_NE_K * 3/4) {
|
||||
tmp -= Cm.x[l]*dm.y;
|
||||
}
|
||||
sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp*dB;
|
||||
sum[(j0/tile_C::J + n)*tile_C::ne + l] -= dm.y*sB;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 4, int, input_layout> tile_A;
|
||||
typedef tile<16, 4, int, input_layout> tile_B;
|
||||
@@ -1834,13 +1717,13 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y;
|
||||
@@ -2573,59 +2456,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
typedef tile<64, 2, int, input_layout> tile_load;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
|
||||
|
||||
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
|
||||
|
||||
const int * x_qs = (const int *) x;
|
||||
const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
|
||||
const int * x_sc = (const int *) x_df + MMQ_TILE_NE_K/QI6_K;
|
||||
const int * y_qs = (const int *) y + 4;
|
||||
const float * y_df = (const float *) y;
|
||||
|
||||
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
|
||||
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B[1];
|
||||
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1] / 2;
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
tile_C C;
|
||||
mma(C, A[n], B[0]);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
|
||||
const int8_t * sc = (const int8_t *) (x_sc + i*MMQ_MMA_TILE_X_K_Q6_K + k00/16);
|
||||
sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * sc[k01/4] * x_df[i*MMQ_MMA_TILE_X_K_Q6_K] * dB;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 4, int, input_layout> tile_A;
|
||||
typedef tile<16, 4, int, input_layout> tile_B;
|
||||
@@ -2651,13 +2482,13 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
|
||||
|
||||
@@ -65,6 +65,11 @@ static __device__ __forceinline__ float op_sqr(float x) {
|
||||
return x * x;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_relu_sqr(float x) {
|
||||
const float r = fmaxf(x, 0.0f);
|
||||
return r * r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_sqrt(float x) {
|
||||
return sqrtf(x);
|
||||
}
|
||||
@@ -615,3 +620,21 @@ void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary
|
||||
GGML_ABORT("Unsupported unary op for fused unary+mul");
|
||||
}
|
||||
}
|
||||
|
||||
/* fused relu + sqr */
|
||||
|
||||
void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node) {
|
||||
const ggml_tensor * src = relu_node->src[0];
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src));
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src->type == sqr_node->type);
|
||||
|
||||
const int k = ggml_nelements(src);
|
||||
if (src->type == GGML_TYPE_F16) {
|
||||
unary_cuda<op_relu_sqr>((const half *)src->data, (half *)sqr_node->data, k, stream);
|
||||
} else {
|
||||
unary_cuda<op_relu_sqr>((const float *)src->data, (float *)sqr_node->data, k, stream);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,6 +91,8 @@ void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node);
|
||||
|
||||
void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node);
|
||||
|
||||
__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
|
||||
return x / (1.0f + expf(-x));
|
||||
}
|
||||
|
||||
Vendored
+1
-1
@@ -33,7 +33,6 @@
|
||||
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
|
||||
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
|
||||
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
|
||||
#define NCCL_CHECK(fn) {ncclResult_t err = fn; if(err != ncclSuccess) { GGML_ABORT("RCCL Failure RCCL returned: %i\n", err); }}
|
||||
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
|
||||
#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
@@ -59,6 +58,7 @@
|
||||
#define cudaDeviceProp hipDeviceProp_t
|
||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||
#define cudaError_t hipError_t
|
||||
#define cudaErrorMemoryAllocation hipErrorOutOfMemory
|
||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||
|
||||
Vendored
+1
@@ -42,6 +42,7 @@
|
||||
#define cudaDeviceProp musaDeviceProp
|
||||
#define cudaDeviceSynchronize musaDeviceSynchronize
|
||||
#define cudaError_t musaError_t
|
||||
#define cudaErrorMemoryAllocation musaErrorMemoryAllocation
|
||||
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
||||
|
||||
@@ -2596,6 +2596,29 @@ static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * se
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
// diag only supports F32 currently
|
||||
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Input must have ne[1] == 1 (vector input)
|
||||
if (src0->ne[1] != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Output must be square in first two dimensions
|
||||
if (dst->ne[0] != dst->ne[1] || dst->ne[0] != src0->ne[0]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(sess);
|
||||
return true;
|
||||
}
|
||||
|
||||
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
|
||||
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
||||
return sess->c_name();
|
||||
@@ -2632,6 +2655,8 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
||||
case GGML_OP_ROPE: return HTP_OP_ROPE;
|
||||
case GGML_OP_REPEAT: return HTP_OP_REPEAT;
|
||||
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
|
||||
case GGML_OP_FILL: return HTP_OP_FILL;
|
||||
case GGML_OP_DIAG: return HTP_OP_DIAG;
|
||||
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(t)) {
|
||||
@@ -3029,6 +3054,17 @@ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * se
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(sess);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
auto sess = static_cast<ggml_hexagon_session *>(dev->context);
|
||||
|
||||
@@ -3159,6 +3195,14 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_cumsum(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_FILL:
|
||||
supp = ggml_hexagon_supported_fill(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_DIAG:
|
||||
supp = ggml_hexagon_supported_diag(sess, op);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -34,6 +34,8 @@ add_library(${HTP_LIB} SHARED
|
||||
argsort-ops.c
|
||||
ssm-conv.c
|
||||
cumsum-ops.c
|
||||
fill-ops.c
|
||||
diag-ops.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
|
||||
@@ -0,0 +1,216 @@
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
|
||||
#include <HAP_farf.h>
|
||||
#include <HAP_perf.h>
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
#include "hvx-types.h"
|
||||
#include "hex-utils.h"
|
||||
#include "hvx-copy.h"
|
||||
#include "hex-dma.h"
|
||||
|
||||
#define htp_diag_tensors_preamble \
|
||||
const struct htp_tensor * restrict src0 = octx->src[0]; \
|
||||
const struct htp_tensor * restrict dst = octx->dst; \
|
||||
\
|
||||
const uint32_t ne02 = src0->ne[2]; \
|
||||
\
|
||||
const uint32_t ne0 = dst->ne[0]; \
|
||||
const uint32_t ne1 = dst->ne[1]; \
|
||||
\
|
||||
const uint32_t nb02 = src0->nb[2]; \
|
||||
const uint32_t nb03 = src0->nb[3]; \
|
||||
\
|
||||
const uint32_t nb1 = dst->nb[1]; \
|
||||
const uint32_t nb2 = dst->nb[2]; \
|
||||
const uint32_t nb3 = dst->nb[3];
|
||||
|
||||
struct htp_diag_context {
|
||||
struct htp_ops_context * octx;
|
||||
size_t src_batch_size;
|
||||
size_t dst_row_size;
|
||||
size_t src_batch_size_aligned;
|
||||
size_t dst_row_size_aligned;
|
||||
uint32_t batches_per_thread;
|
||||
uint32_t total_batches;
|
||||
};
|
||||
|
||||
#define htp_diag_preamble \
|
||||
struct htp_diag_context * dctx = (struct htp_diag_context *) data; \
|
||||
struct htp_ops_context * octx = dctx->octx; \
|
||||
htp_diag_tensors_preamble;
|
||||
|
||||
static inline void hvx_diag_row_f32(const float * restrict src, float * restrict dst,
|
||||
uint32_t row_idx, uint32_t n) {
|
||||
hvx_splat_f32_a((uint8_t *) dst, 0.0f, n);
|
||||
dst[row_idx] = src[row_idx];
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per thread worker: DMA src fetch, compute in VTCM, DMA dst writeback
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void diag_thread_f32_dma(unsigned int nth, unsigned int ith, void * data) {
|
||||
htp_diag_preamble;
|
||||
dma_queue * dma_queue = octx->ctx->dma[ith];
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
const uint32_t ib0 = dctx->batches_per_thread * ith;
|
||||
const uint32_t ib1 = MIN(ib0 + dctx->batches_per_thread, dctx->total_batches);
|
||||
|
||||
if (ib0 >= ib1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t src_batch_size = dctx->src_batch_size;
|
||||
const size_t dst_row_size = dctx->dst_row_size;
|
||||
const size_t src_batch_size_aligned = dctx->src_batch_size_aligned;
|
||||
const size_t dst_row_size_aligned = dctx->dst_row_size_aligned;
|
||||
|
||||
const uint8_t * src_data = (const uint8_t *) src0->data;
|
||||
uint8_t * dst_data = (uint8_t *) dst->data;
|
||||
|
||||
// 1 src buffer + 1 dst row buffer per thread in VTCM
|
||||
uint8_t * src_spad = octx->src0_spad.data + (ith * src_batch_size_aligned);
|
||||
uint8_t * dst_spad = octx->dst_spad.data + (ith * dst_row_size_aligned);
|
||||
|
||||
for (uint32_t ib = ib0; ib < ib1; ib++) {
|
||||
const uint32_t i3 = ib / ne02;
|
||||
const uint32_t i2 = ib % ne02;
|
||||
|
||||
const uint8_t * src_batch = src_data + i3 * nb03 + i2 * nb02;
|
||||
|
||||
// Fetch source vector into VTCM
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue,
|
||||
dma_make_ptr(src_spad, src_batch),
|
||||
src_batch_size_aligned, src_batch_size, 1);
|
||||
dma_queue_flush(dma_queue);
|
||||
|
||||
const float * src_spad_f32 = (const float *) src_spad;
|
||||
float * dst_spad_f32 = (float *) dst_spad;
|
||||
|
||||
for (uint32_t i1 = 0; i1 < ne1; i1++) {
|
||||
// Compute row in VTCM
|
||||
hvx_diag_row_f32(src_spad_f32, dst_spad_f32, i1, ne0);
|
||||
|
||||
// Write completed row back to DDR
|
||||
uint8_t * dst_row = dst_data + i3 * nb3 + i2 * nb2 + i1 * nb1;
|
||||
dma_queue_push_vtcm_to_ddr(dma_queue,
|
||||
dma_make_ptr(dst_row, dst_spad),
|
||||
dst_row_size, dst_row_size_aligned, 1);
|
||||
dma_queue_flush(dma_queue);
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "diag-f32-dma %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
|
||||
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ib0, ib1,
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per thread worker: Direct HVX (no DMA)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void diag_thread_f32(unsigned int nth, unsigned int ith, void * data) {
|
||||
htp_diag_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
const uint8_t * src_data = (const uint8_t *) src0->data;
|
||||
uint8_t * dst_data = (uint8_t *) dst->data;
|
||||
|
||||
const uint32_t ib0 = dctx->batches_per_thread * ith;
|
||||
const uint32_t ib1 = MIN(ib0 + dctx->batches_per_thread, dctx->total_batches);
|
||||
|
||||
for (uint32_t ib = ib0; ib < ib1; ib++) {
|
||||
const uint32_t i3 = ib / ne02;
|
||||
const uint32_t i2 = ib % ne02;
|
||||
|
||||
const float * restrict src_batch = (const float *)(src_data + i3 * nb03 + i2 * nb02);
|
||||
|
||||
for (uint32_t i1 = 0; i1 < ne1; i1++) {
|
||||
float * restrict dst_row = (float *)(dst_data + i3 * nb3 + i2 * nb2 + i1 * nb1);
|
||||
hvx_diag_row_f32(src_batch, dst_row, i1, ne0);
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "diag-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
|
||||
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ib0, ib1,
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
int op_diag_f32(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * src0 = octx->src[0];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
const uint32_t total_batches = src0->ne[2] * src0->ne[3];
|
||||
const uint32_t n_threads = MIN(octx->n_threads, total_batches);
|
||||
|
||||
const size_t src_batch_size = src0->ne[0] * sizeof(float);
|
||||
const size_t dst_row_size = dst->ne[0] * sizeof(float);
|
||||
const size_t src_batch_size_aligned = hex_round_up(src_batch_size, VLEN);
|
||||
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
|
||||
|
||||
// 1 src buffer + 1 dst row buffer per thread
|
||||
const size_t spad_per_thread = src_batch_size_aligned + dst_row_size_aligned;
|
||||
|
||||
octx->src0_spad.size_per_thread = src_batch_size_aligned;
|
||||
octx->dst_spad.size_per_thread = dst_row_size_aligned;
|
||||
|
||||
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
|
||||
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
|
||||
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL;
|
||||
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->dst_spad.src = NULL;
|
||||
|
||||
struct htp_diag_context dctx = {
|
||||
.octx = octx,
|
||||
.src_batch_size = src_batch_size,
|
||||
.dst_row_size = dst_row_size,
|
||||
.src_batch_size_aligned = src_batch_size_aligned,
|
||||
.dst_row_size_aligned = dst_row_size_aligned,
|
||||
.batches_per_thread = (total_batches + n_threads - 1) / n_threads,
|
||||
.total_batches = total_batches,
|
||||
};
|
||||
|
||||
if (octx->ctx->vtcm_size < spad_per_thread * n_threads) {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, diag_thread_f32, &dctx, n_threads);
|
||||
} else {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, diag_thread_f32_dma, &dctx, n_threads);
|
||||
}
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
int op_diag(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
int err = HTP_STATUS_OK;
|
||||
|
||||
switch (dst->type) {
|
||||
case HTP_TYPE_F32:
|
||||
err = op_diag_f32(octx);
|
||||
break;
|
||||
default:
|
||||
err = HTP_STATUS_NO_SUPPORT;
|
||||
break;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
@@ -0,0 +1,123 @@
|
||||
#pragma clang diagnostic ignored "-Wunused-variable"
|
||||
#pragma clang diagnostic ignored "-Wunused-function"
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
|
||||
#include <HAP_farf.h>
|
||||
#include <HAP_perf.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "hvx-copy.h"
|
||||
#include "hvx-utils.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
|
||||
// ggml op_params layout for FILL:
|
||||
// op_params[0] (as float) - the scalar fill value
|
||||
|
||||
#define fill_preamble \
|
||||
const struct htp_tensor * dst = octx->dst; \
|
||||
\
|
||||
const uint32_t ne0 = dst->ne[0]; \
|
||||
const uint32_t ne1 = dst->ne[1]; \
|
||||
const uint32_t ne2 = dst->ne[2]; \
|
||||
const uint32_t ne3 = dst->ne[3]; \
|
||||
\
|
||||
const uint32_t nb1 = dst->nb[1]; \
|
||||
const uint32_t nb2 = dst->nb[2]; \
|
||||
const uint32_t nb3 = dst->nb[3]; \
|
||||
\
|
||||
const uint32_t nr = ne1 * ne2 * ne3;
|
||||
|
||||
struct htp_fill_context {
|
||||
struct htp_ops_context * octx;
|
||||
uint32_t nrows_per_thread;
|
||||
uint32_t total_rows; // ne1 * ne2 * ne3
|
||||
bool opt_path;
|
||||
HVX_Vector splat_vec;
|
||||
uint32_t elem_size;
|
||||
};
|
||||
|
||||
static void fill_thread(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_fill_context * fctx = (const struct htp_fill_context *) data;
|
||||
struct htp_ops_context * octx = fctx->octx;
|
||||
fill_preamble;
|
||||
|
||||
// Parallelise over the flat row index spanning ne1*ne2*ne3
|
||||
const uint32_t ir0 = fctx->nrows_per_thread * ith;
|
||||
const uint32_t ir1 = MIN(ir0 + fctx->nrows_per_thread, fctx->total_rows);
|
||||
|
||||
uint64_t t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
if (fctx->opt_path) {
|
||||
// Opt path: tensor is fully contiguous, treat as flat array
|
||||
const uint32_t elem_start = ir0 * ne0;
|
||||
const uint32_t elem_end = ir1 * ne0;
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + elem_start * fctx->elem_size;
|
||||
hvx_splat_u(dst_ptr, fctx->splat_vec, elem_end - elem_start, fctx->elem_size);
|
||||
} else {
|
||||
// Non-contiguous path: must respect strides
|
||||
for (uint32_t ir = ir0; ir < ir1; ++ir) {
|
||||
const uint32_t i1 = ir % ne1;
|
||||
const uint32_t i2 = (ir / ne1) % ne2;
|
||||
const uint32_t i3 = ir / (ne1 * ne2);
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i1*nb1 + i2*nb2 + i3*nb3;
|
||||
hvx_splat_u(dst_ptr, fctx->splat_vec, ne0, fctx->elem_size);
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t t2 = HAP_perf_get_qtimer_count();
|
||||
FARF(HIGH, "fill %u/%u: rows %u:%u usec %u\n",
|
||||
ith, nth, ir0, ir1, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
int op_fill(struct htp_ops_context * octx) {
|
||||
fill_preamble;
|
||||
|
||||
if (dst->type != HTP_TYPE_F32 && dst->type != HTP_TYPE_F16) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
// nr = ne1*ne2*ne3 (flat row count across all outer dims); parallelise over it.
|
||||
const uint32_t n_threads = MIN(nr, octx->n_threads);
|
||||
|
||||
// Optimize if fully contiguous: skip stride arithmetic, treat as flat array
|
||||
const bool opt_path = (nb2 == nb1 * ne1) && (nb3 == nb2 * ne2);
|
||||
|
||||
FARF(HIGH, "fill: (%ux%ux%ux%u) type=%u opt=%d\n",
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->type, (int) opt_path);
|
||||
|
||||
float val_f32 = 0.f;
|
||||
memcpy(&val_f32, &octx->op_params[0], sizeof(float));
|
||||
|
||||
struct htp_fill_context fctx = {
|
||||
.octx = octx,
|
||||
.nrows_per_thread = (nr + n_threads - 1) / n_threads,
|
||||
.total_rows = nr,
|
||||
.opt_path = opt_path,
|
||||
};
|
||||
|
||||
switch (dst->type) {
|
||||
case HTP_TYPE_F32:
|
||||
fctx.splat_vec = hvx_vec_splat_f32(val_f32);
|
||||
fctx.elem_size = sizeof(float);
|
||||
break;
|
||||
case HTP_TYPE_F16:
|
||||
fctx.splat_vec = hvx_vec_splat_f16((_Float16) val_f32);
|
||||
fctx.elem_size = sizeof(_Float16);
|
||||
break;
|
||||
default:
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, fill_thread, &fctx, n_threads);
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
@@ -98,5 +98,7 @@ int op_repeat(struct htp_ops_context * octx);
|
||||
int op_argsort(struct htp_ops_context * octx);
|
||||
int op_ssm_conv(struct htp_ops_context * octx);
|
||||
int op_cumsum(struct htp_ops_context * octx);
|
||||
int op_fill(struct htp_ops_context * octx);
|
||||
int op_diag(struct htp_ops_context * octx);
|
||||
|
||||
#endif /* HTP_CTX_H */
|
||||
|
||||
@@ -80,6 +80,8 @@ enum htp_op_code {
|
||||
HTP_OP_SSM_CONV,
|
||||
HTP_OP_REPEAT,
|
||||
HTP_OP_CUMSUM,
|
||||
HTP_OP_FILL,
|
||||
HTP_OP_DIAG,
|
||||
|
||||
HTP_OP_INVALID
|
||||
};
|
||||
|
||||
@@ -514,6 +514,12 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_CUMSUM:
|
||||
return op_cumsum(octx);
|
||||
|
||||
case HTP_OP_FILL:
|
||||
return op_fill(octx);
|
||||
|
||||
case HTP_OP_DIAG:
|
||||
return op_diag(octx);
|
||||
|
||||
case HTP_OP_INVALID:
|
||||
break;
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ libggml-htp-v68.so = 1
|
||||
libggml-htp-v69.so = 1
|
||||
libggml-htp-v73.so = 1
|
||||
libggml-htp-v75.so = 1
|
||||
libggml-htp-v79.so = 1
|
||||
libggml-htp-v81.so = 1
|
||||
|
||||
[ControlFlags]
|
||||
@@ -31,6 +32,7 @@ libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v79.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v81.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
|
||||
[Strings]
|
||||
|
||||
@@ -931,13 +931,13 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
|
||||
}
|
||||
|
||||
struct ggml_metal_event {
|
||||
void * obj; // id<MTLEvent>
|
||||
void * obj; // id<MTLSharedEvent>
|
||||
|
||||
atomic_int value;
|
||||
};
|
||||
|
||||
void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
|
||||
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
|
||||
id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
|
||||
|
||||
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
|
||||
|
||||
@@ -945,7 +945,7 @@ void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t
|
||||
}
|
||||
|
||||
void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
|
||||
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
|
||||
id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
|
||||
|
||||
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
|
||||
|
||||
@@ -953,7 +953,7 @@ void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cm
|
||||
}
|
||||
|
||||
ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
|
||||
id<MTLEvent> event = [dev->mtl_device newEvent];
|
||||
id<MTLSharedEvent> event = [dev->mtl_device newSharedEvent];
|
||||
|
||||
ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
|
||||
|
||||
@@ -964,7 +964,7 @@ ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
|
||||
}
|
||||
|
||||
void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
|
||||
id<MTLEvent> event = ev->obj;
|
||||
id<MTLSharedEvent> event = ev->obj;
|
||||
[event release];
|
||||
|
||||
free(ev);
|
||||
@@ -973,14 +973,13 @@ void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev
|
||||
}
|
||||
|
||||
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
|
||||
@autoreleasepool {
|
||||
id<MTLEvent> event = ev->obj;
|
||||
|
||||
id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
|
||||
[cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
|
||||
[cmd_buf commit];
|
||||
[cmd_buf waitUntilCompleted];
|
||||
id<MTLSharedEvent> event = ev->obj;
|
||||
const bool res = [event waitUntilSignaledValue:atomic_load_explicit(&ev->value, memory_order_relaxed) timeoutMS:60000];
|
||||
if (!res) {
|
||||
GGML_ABORT("%s: failed to wait for event\n", __func__);
|
||||
}
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
|
||||
|
||||
@@ -918,6 +918,10 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
|
||||
static std::vector<ggml_backend_device_ptr> devs;
|
||||
|
||||
if (!initialized) {
|
||||
// workaround macOS limitation (kIOGPUCommandBufferCallbackErrorImpactingInteractivity) until proper fix becomes possible
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/20141#issuecomment-4272947703
|
||||
setenv("AGX_RELAX_CDM_CTXSTORE_TIMEOUT", "1", true);
|
||||
|
||||
static ggml_backend_metal_reg_ptr reg_ctx(ggml_backend_metal_reg_init());
|
||||
|
||||
for (int i = 0; i < g_devices; ++i) {
|
||||
|
||||
@@ -5116,115 +5116,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
|
||||
// Transpose weights
|
||||
size_t q_size_bytes = K * M / 4 * sizeof(float);
|
||||
cl_buffer_region region;
|
||||
region.origin = 0;
|
||||
region.size = q_size_bytes;
|
||||
cl_mem qT_d = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_quant_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_mem q_d_image1D;
|
||||
cl_mem qT_d_image1D;
|
||||
|
||||
cl_image_format img_fmt_1d;
|
||||
cl_image_desc img_desc_1d;
|
||||
|
||||
img_fmt_1d = { CL_RGBA, CL_FLOAT };
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 4 / 4;
|
||||
img_desc_1d.buffer = extra->q;
|
||||
q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
img_fmt_1d = { CL_RGBA, CL_FLOAT };
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 4 / 4;
|
||||
img_desc_1d.buffer = qT_d;
|
||||
qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
int height_q = M / 4;
|
||||
int width_q = K / 4 / 4;
|
||||
kernel = backend_ctx->kernel_transpose_32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
|
||||
|
||||
size_t local_size_q[3] = {4, 16, 1};
|
||||
size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
// Transpose scales
|
||||
size_t d_size_bytes = M * (K / 32) * 2;
|
||||
region.origin = 0;
|
||||
region.size = d_size_bytes;
|
||||
cl_mem dT_d = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_scales_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_mem d_d_image1D;
|
||||
cl_mem dT_d_image1D;
|
||||
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_fmt_1d = { CL_R, CL_HALF_FLOAT };
|
||||
img_desc_1d.image_width = M * K / 32;
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.buffer = extra->d;
|
||||
d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 32 / 4;
|
||||
img_desc_1d.buffer = dT_d;
|
||||
dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
int height_s = M / 4;
|
||||
int width_s = K / 32;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_16_4x1;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
|
||||
|
||||
size_t local_size_s[3] = {4, 16, 1};
|
||||
size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
// copy transposed buffer contents to original buffers
|
||||
CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
CL_CHECK(clReleaseMemObject(qT_d));
|
||||
CL_CHECK(clReleaseMemObject(dT_d));
|
||||
|
||||
CL_CHECK(clReleaseMemObject(q_d_image1D));
|
||||
CL_CHECK(clReleaseMemObject(d_d_image1D));
|
||||
CL_CHECK(clReleaseMemObject(qT_d_image1D));
|
||||
CL_CHECK(clReleaseMemObject(dT_d_image1D));
|
||||
transpose_2d_as_32b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
|
||||
} // end transpose
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
@@ -9956,19 +9849,18 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const enum ggml_type src0t = src0->type;
|
||||
const enum ggml_type src1t = src1->type;
|
||||
|
||||
GGML_ASSERT(src0t == GGML_TYPE_Q8_0);
|
||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_Q8_0);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
GGML_ASSERT(src1->view_offs == 0);
|
||||
GGML_ASSERT(dst->view_offs == 0);
|
||||
|
||||
@@ -9989,148 +9881,112 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_kernel kernel;
|
||||
|
||||
// init CL objects
|
||||
cl_int status;
|
||||
cl_image_format img_fmt_1d;
|
||||
cl_image_desc img_desc_1d;
|
||||
cl_int err;
|
||||
cl_image_format img_fmt;
|
||||
cl_image_desc img_desc;
|
||||
cl_buffer_region region;
|
||||
cl_mem A_image1d;
|
||||
cl_mem B_image1d;
|
||||
cl_mem B_sub_buffer;
|
||||
cl_mem S_image1d;
|
||||
// for B transpose
|
||||
cl_mem B_image1d_trans = nullptr;
|
||||
cl_mem B_d = nullptr;
|
||||
|
||||
cl_mem D_image1d;
|
||||
cl_mem D_sub_buffer;
|
||||
|
||||
int M = ne01;
|
||||
int N = ne1;
|
||||
int K = ne00;
|
||||
|
||||
// create an image for A
|
||||
img_fmt_1d = { CL_R, CL_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 4; // Divide by 4 for char -> float
|
||||
img_desc_1d.buffer = extra0_q8_0->q;
|
||||
A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
if (ne1 == 1) {
|
||||
cl_mem q_img = nullptr;
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
|
||||
// create an image for Scale
|
||||
img_fmt_1d = { CL_R, CL_HALF_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 32; // Block size is 32
|
||||
img_desc_1d.buffer = extra0_q8_0->d;
|
||||
S_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
// image for q
|
||||
img_fmt = { CL_R, CL_UNSIGNED_INT32};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 4;
|
||||
img_desc.buffer = extra0_q8_0->q;
|
||||
CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// create a sub_buffer for B
|
||||
region.origin = (extra1->offset); // + src1->view_offs);
|
||||
region.size = K * N * sizeof(float);
|
||||
B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
// create a sub_buffer for B
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// create an image for B from sub_buffer: RGBA (OCL)
|
||||
img_fmt_1d = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = K * N / 4;
|
||||
img_desc_1d.buffer = B_sub_buffer;
|
||||
B_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// Create subbuffer and image1d_buffer for dst
|
||||
region.origin = (extrad->offset); // + dst->view_offs;
|
||||
region.size = M * N * sizeof(float);
|
||||
D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
img_fmt_1d = {CL_R, CL_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * N;
|
||||
img_desc_1d.buffer = D_sub_buffer;
|
||||
D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
size_t local_work_size[3] = {1, 1, 1};
|
||||
size_t global_work_size[3] = {1, 1, 1};
|
||||
|
||||
if (N == 1) {
|
||||
kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32;
|
||||
|
||||
int r2 = 1;
|
||||
int r3 = 1;
|
||||
cl_uint k_arg = 0;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q8_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &extra1->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &extrad->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
|
||||
|
||||
size_t wavesize = backend_ctx->adreno_wave_size;
|
||||
local_work_size[0] = wavesize;
|
||||
local_work_size[1] = 4; // reduce factor
|
||||
local_work_size[2] = 1;
|
||||
size_t local_work_size[] = { wavesize, 4, 1 };
|
||||
size_t global_work_size[] = { CEIL_DIV(M, wavesize)*wavesize, 4, 1 };
|
||||
|
||||
global_work_size[0] = ((M + wavesize - 1) / wavesize) * wavesize;
|
||||
global_work_size[1] = 4; // reduce factor
|
||||
global_work_size[2] = 1;
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(q_img));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
} else {
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
int padding;
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_sub_buf_trans = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
cl_mem b_img_trans = nullptr;
|
||||
|
||||
//how many extra elements beyond multiple of 8
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// pad N to multiple of 8
|
||||
int extra_elements = N % 8;
|
||||
|
||||
//how much padding to add
|
||||
padding = 0;
|
||||
int padding = 0;
|
||||
if (extra_elements > 0){
|
||||
padding = 8 - extra_elements;
|
||||
}
|
||||
|
||||
// Specify the starting offset (in bytes)
|
||||
// subbuffer for transposed activations
|
||||
region.origin = 0;
|
||||
// Specify the size of the sub-buffer (divide by 2 for FP16)
|
||||
region.size = K * (N + padding) * sizeof(float)/2;
|
||||
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
||||
B_d = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_act_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
|
||||
cl_image_desc image_desc_B_d_output = {
|
||||
CL_MEM_OBJECT_IMAGE1D_BUFFER,
|
||||
static_cast<size_t>(K * (N + padding)/4),
|
||||
0, 0, 0, 0, 0, 0, 0, { B_d }
|
||||
};
|
||||
B_image1d_trans = clCreateImage(
|
||||
context,
|
||||
0,
|
||||
&image_format_B_d_output,
|
||||
&image_desc_B_d_output,
|
||||
NULL,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
// image for transposed activations
|
||||
img_fmt = {CL_RGBA, CL_HALF_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * (N + padding) / 4;
|
||||
img_desc.buffer = b_sub_buf_trans;
|
||||
CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// transpose activations
|
||||
int height_B = N/4;
|
||||
if (height_B == 0) {
|
||||
height_B = 1;
|
||||
@@ -10139,58 +9995,39 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
int padded_height_B = (N + padding)/4;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_32_16;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_image1d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
|
||||
|
||||
size_t local_size_t[2] = { 1, 16 };
|
||||
size_t global_size_t[2] = {
|
||||
static_cast<size_t>(width_B),
|
||||
static_cast<size_t>(padded_height_B)
|
||||
};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
|
||||
size_t local_work_size_t[2] = { 1, 16 };
|
||||
size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
|
||||
|
||||
// gemm
|
||||
kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4;
|
||||
|
||||
int N_with_padding = N + padding;
|
||||
int padded_N = N + padding;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &K));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &M));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &N_with_padding));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &padded_N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
global_work_size[0] = (size_t)(N + 7) / 8;
|
||||
global_work_size[1] = (size_t)(M + 3) / 4;
|
||||
global_work_size[2] = 1;
|
||||
size_t global_work_size[] = { (size_t)CEIL_DIV(N, 8), (size_t)CEIL_DIV(M, 4), 1 };
|
||||
size_t local_work_size[] = { 2, 128, 1 };
|
||||
|
||||
local_work_size[0] = 2;
|
||||
local_work_size[1] = 128;
|
||||
local_work_size[2] = 1;
|
||||
}
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
// enqueue kernel with profiling
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
// deallocate sub buffers and images
|
||||
CL_CHECK(clReleaseMemObject(A_image1d));
|
||||
CL_CHECK(clReleaseMemObject(B_sub_buffer));
|
||||
CL_CHECK(clReleaseMemObject(B_image1d));
|
||||
CL_CHECK(clReleaseMemObject(S_image1d));
|
||||
CL_CHECK(clReleaseMemObject(D_sub_buffer));
|
||||
CL_CHECK(clReleaseMemObject(D_image1d));
|
||||
if (B_image1d_trans) {
|
||||
CL_CHECK(clReleaseMemObject(B_image1d_trans));
|
||||
}
|
||||
if (B_d) {
|
||||
CL_CHECK(clReleaseMemObject(B_d));
|
||||
CL_CHECK(clReleaseMemObject(b_img_trans));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(backend);
|
||||
|
||||
@@ -19,7 +19,6 @@
|
||||
#include <iomanip>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <openvino/core/dimension.hpp>
|
||||
#include <openvino/core/except.hpp>
|
||||
#include <openvino/core/node.hpp>
|
||||
@@ -207,8 +206,22 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
|
||||
break;
|
||||
}
|
||||
case GGML_OP_ROPE: {
|
||||
const int mode = node->op_params[2];
|
||||
switch (mode) {
|
||||
case GGML_ROPE_TYPE_NEOX: {
|
||||
op_case = 0x00010000;
|
||||
break;
|
||||
}
|
||||
case GGML_ROPE_TYPE_IMROPE: {
|
||||
op_case = 0x00020000;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
op_case = 0x00000000;
|
||||
break;
|
||||
}
|
||||
if (node->src[0]->op == GGML_OP_VIEW) {
|
||||
op_case = 2;
|
||||
op_case = (op_case | 0x00000002);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -573,9 +586,6 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
|
||||
}
|
||||
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
|
||||
static std::mutex weights_mutex;
|
||||
std::lock_guard<std::mutex> lock(weights_mutex);
|
||||
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
auto * nodes = cgraph->nodes;
|
||||
auto n_nodes = cgraph->n_nodes;
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#include <cstring>
|
||||
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
|
||||
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
|
||||
#include <openvino/runtime/properties.hpp>
|
||||
#include <optional>
|
||||
|
||||
ov::Core & ov_singleton_core() {
|
||||
@@ -42,11 +43,13 @@ void ggml_openvino_device_config::init() {
|
||||
{"NPUW_DQ", "YES" },
|
||||
{"NPUW_DQ_FULL", "NO" },
|
||||
};
|
||||
if (cache_dir) {
|
||||
if (cache_dir && strlen(cache_dir) > 0) {
|
||||
compile_config["NPUW_CACHE_DIR"] = cache_dir;
|
||||
compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
|
||||
}
|
||||
} else if (cache_dir) {
|
||||
ov_singleton_core().set_property(ov::cache_dir(cache_dir));
|
||||
} else if (cache_dir && strlen(cache_dir) > 0) {
|
||||
compile_config.insert(ov::cache_dir(cache_dir));
|
||||
compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
|
||||
}
|
||||
|
||||
// Initialize remote context with queue sharing for GPU
|
||||
@@ -259,10 +262,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
|
||||
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
|
||||
int64_t n_blocks = n_elements / layout.weights_per_block;
|
||||
layout.scales_size = n_blocks * sizeof(uint16_t);
|
||||
// For symmetric quantization, we only need one zp value (not one per block)
|
||||
// Zero points are stored in U4 or U8 format matching the weight type
|
||||
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
|
||||
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
|
||||
// For symmetric quantization, no zp needed (weights stored as signed)
|
||||
if (layout.is_symmetric) {
|
||||
layout.zp_size = 0;
|
||||
} else {
|
||||
layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
|
||||
}
|
||||
|
||||
layout.weights_offset = 0;
|
||||
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
|
||||
@@ -313,10 +318,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
|
||||
// Scales: F16 per block
|
||||
int64_t n_blocks = n_elements / layout.weights_per_block;
|
||||
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
|
||||
// Zero points: U4 or U8 matching weight type
|
||||
// For symmetric quantization, we only need one zp value (not one per block)
|
||||
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
|
||||
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
|
||||
// For symmetric quantization, no zp needed (weights stored as signed)
|
||||
if (layout.is_symmetric) {
|
||||
layout.zp_size = 0;
|
||||
} else {
|
||||
layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
|
||||
}
|
||||
|
||||
// Layout in buffer: [weights | scales | zp] with alignment
|
||||
layout.weights_offset = 0;
|
||||
|
||||
@@ -145,13 +145,18 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
|
||||
return ctx->data;
|
||||
}
|
||||
|
||||
static bool is_stateful_enabled() {
|
||||
static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION");
|
||||
return stateful && *stateful != '\0' && strcmp(stateful, "0") != 0;
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
// GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
|
||||
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
|
||||
|
||||
// Put kvcache on device memory for GPU (NPU memory is too small even for kvcache)
|
||||
if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" &&
|
||||
!getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
|
||||
!is_stateful_enabled()) {
|
||||
GGML_ASSERT(ctx->tensor_extras.empty());
|
||||
auto device = ctx->device;
|
||||
auto size = ctx->size;
|
||||
@@ -600,6 +605,14 @@ bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) {
|
||||
|
||||
static void ggml_backend_openvino_free(ggml_backend_t backend) {
|
||||
ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context;
|
||||
|
||||
if (ctx->runtime_context) {
|
||||
auto r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
|
||||
if (--r_ctx->backend_count == 0) {
|
||||
r_ctx->clear_caches();
|
||||
}
|
||||
}
|
||||
|
||||
delete ctx;
|
||||
delete backend;
|
||||
}
|
||||
@@ -644,7 +657,12 @@ static ggml_guid_t ggml_backend_openvino_guid(void) {
|
||||
}
|
||||
|
||||
static std::shared_ptr<ov_runtime_context> get_ov_runtime_context_ptr() {
|
||||
static std::shared_ptr<ov_runtime_context> r_ctx = std::make_shared<ov_runtime_context>();
|
||||
static std::shared_ptr<ov_runtime_context> r_ctx = [] {
|
||||
auto ctx = std::make_shared<ov_runtime_context>();
|
||||
ctx->device = ggml_openvino_get_device_name();
|
||||
ctx->stateful = is_stateful_enabled() && !ggml_openvino_is_npu();
|
||||
return ctx;
|
||||
}();
|
||||
return r_ctx;
|
||||
}
|
||||
|
||||
@@ -669,8 +687,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
|
||||
}
|
||||
|
||||
std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
|
||||
r_ctx->device = ggml_openvino_get_device_name();
|
||||
r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu();
|
||||
r_ctx->backend_count++;
|
||||
|
||||
ggml_backend_t openvino_backend = new ggml_backend{
|
||||
/* .guid = */ ggml_backend_openvino_guid(),
|
||||
@@ -883,7 +900,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
|
||||
const int32_t * op_params = op->op_params;
|
||||
const int n_dims = op_params[1];
|
||||
const int mode = op_params[2];
|
||||
if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
|
||||
if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_IMROPE) {
|
||||
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode);
|
||||
return true;
|
||||
}
|
||||
@@ -896,14 +913,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
|
||||
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
|
||||
return true;
|
||||
}
|
||||
float freq_scale;
|
||||
float ext_factor;
|
||||
memcpy(&freq_scale, op_params + 6, sizeof(float));
|
||||
memcpy(&ext_factor, op_params + 7, sizeof(float));
|
||||
if (ext_factor != 0.0f) {
|
||||
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
|
||||
return true;
|
||||
}
|
||||
if (op->src[0]->op == GGML_OP_VIEW) {
|
||||
if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) {
|
||||
// GGML_LOG_WARN(
|
||||
@@ -913,6 +922,12 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (mode == GGML_ROPE_TYPE_IMROPE &&
|
||||
(op->src[2] != 0 || ((const float *) op_params)[6] != 1 || ((const float *) op_params)[7] != 0 ||
|
||||
((const float *) op_params)[8] != 1)) {
|
||||
// GGML_LOG_WARN("OpenVINO backend does not support IMROPE with freq_factors, freq_scale, ext_factor, and attn_factor\n");
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@@ -942,6 +957,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
|
||||
// GGML_OP_SOFT_MAX,
|
||||
GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
|
||||
static const std::set<ggml_unary_op> supported_unary_ops{
|
||||
GGML_UNARY_OP_GELU,
|
||||
GGML_UNARY_OP_SILU,
|
||||
};
|
||||
static const std::set<ggml_glu_op> supported_glu_ops{
|
||||
|
||||
@@ -46,6 +46,7 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) {
|
||||
|
||||
// Extracts (weight, scales, zp) from Q4_0 tensors.
|
||||
// Data layout is: |16 bit scale|32 x 4bit weights|.
|
||||
// When zp_arr is empty (symmetric), weights are stored as signed i4 (value - 8).
|
||||
void extract_q4_0_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
@@ -55,28 +56,32 @@ void extract_q4_0_data(const ggml_tensor * tensor,
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path
|
||||
|
||||
// For Q4_0, zero point is always 8
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 8 | (8 << 4); // Pack two 4-bit values
|
||||
}
|
||||
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
|
||||
// For asymmetric quantization, compute per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
if (!is_symmetric) {
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
|
||||
// Pack two 4-bit zero points per byte
|
||||
if (i % 2 == 0) {
|
||||
zp[i / 2] = 8; // Lower nibble
|
||||
} else {
|
||||
zp[i / 2] |= (8 << 4); // Upper nibble
|
||||
}
|
||||
}
|
||||
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
|
||||
});
|
||||
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
|
||||
});
|
||||
} else {
|
||||
// Symmetric: unpack as u4 then convert to i4 by subtracting 8 (XOR each nibble)
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
|
||||
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
|
||||
// Convert u4 to i4: subtract 8 from each nibble. XOR 0x88 flips each nibble by 8.
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
weights[i * 16 + j] ^= 0x88;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Extracts (weight, scales, zp) from Q4_1 tensors.
|
||||
@@ -123,6 +128,7 @@ void extract_q4_1_data(const ggml_tensor * tensor,
|
||||
|
||||
// Extracts (weight, scales, zp) from Q8_0 tensors.
|
||||
// Data layout is: |16 bit scale|32 x 8bit weights|.
|
||||
// When zp_arr is empty (symmetric), weights are stored as signed i8 directly.
|
||||
void extract_q8_0_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
@@ -133,29 +139,30 @@ void extract_q8_0_data(const ggml_tensor * tensor,
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
|
||||
|
||||
// For Q8_0, zero point is always 128
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 128;
|
||||
}
|
||||
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
|
||||
// For asymmetric quantization, store per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
if (!is_symmetric) {
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
|
||||
zp[i] = 128;
|
||||
}
|
||||
for (size_t j = 0; j < weights_per_block; ++j) {
|
||||
uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
|
||||
// Original data is in int8_t, so we add a bias of -128 and invert the first bit.
|
||||
x ^= 1 << 7;
|
||||
weights[i * weights_per_block + j] = x;
|
||||
}
|
||||
});
|
||||
for (size_t j = 0; j < weights_per_block; ++j) {
|
||||
uint8_t x = block_data[j + 2];
|
||||
x ^= 1 << 7; // Convert int8 to uint8 by flipping sign bit
|
||||
weights[i * weights_per_block + j] = x;
|
||||
}
|
||||
});
|
||||
} else {
|
||||
// Symmetric: store original int8 values directly (no unsigned bias)
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
|
||||
// Copy int8 weights as-is (the tensor element type is i8)
|
||||
memcpy(weights + i * weights_per_block, block_data + 2, weights_per_block);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void unpack_256_4(const uint8_t * data, uint8_t * dst) {
|
||||
@@ -256,44 +263,62 @@ void extract_q6_k_data(const ggml_tensor * tensor,
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
|
||||
|
||||
// For Q6_K, zero point is always 32
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 32;
|
||||
}
|
||||
|
||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
|
||||
float scale_factor =
|
||||
static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104))); // (128+64+16)/2
|
||||
|
||||
for (size_t j = 0; j < 16; j++) {
|
||||
scales[j + i * 16] =
|
||||
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
|
||||
// For asymmetric quantization, store per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
if (!is_symmetric) {
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
|
||||
for (size_t j = 0; j < 16; j++) {
|
||||
scales[j + i * 16] =
|
||||
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
|
||||
zp[j + i * 16] = 32;
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t * ql = block_data;
|
||||
uint8_t * qh = block_data + 128;
|
||||
|
||||
for (int64_t j = 0; j < 32; ++j) {
|
||||
weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
|
||||
weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
|
||||
weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
|
||||
weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
|
||||
weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
|
||||
weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
|
||||
weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
|
||||
weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
|
||||
}
|
||||
});
|
||||
uint8_t * ql = block_data;
|
||||
uint8_t * qh = block_data + 128;
|
||||
for (int64_t j = 0; j < 32; ++j) {
|
||||
weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
|
||||
weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
|
||||
weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
|
||||
weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
|
||||
weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
|
||||
weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
|
||||
weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
|
||||
weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
// Symmetric: subtract 32 from each weight to store as signed i8
|
||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
|
||||
for (size_t j = 0; j < 16; j++) {
|
||||
scales[j + i * 16] =
|
||||
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
|
||||
}
|
||||
uint8_t * ql = block_data;
|
||||
uint8_t * qh = block_data + 128;
|
||||
auto * signed_weights = reinterpret_cast<int8_t *>(weights);
|
||||
for (int64_t j = 0; j < 32; ++j) {
|
||||
signed_weights[i * 256 + j] = static_cast<int8_t>((ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 32] =
|
||||
static_cast<int8_t>((ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 64] = static_cast<int8_t>((ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 96] =
|
||||
static_cast<int8_t>((ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 128] =
|
||||
static_cast<int8_t>((ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 160] =
|
||||
static_cast<int8_t>((ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 192] =
|
||||
static_cast<int8_t>((ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4)) - 32;
|
||||
signed_weights[i * 256 + j + 224] =
|
||||
static_cast<int8_t>((ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4)) - 32;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
|
||||
@@ -389,11 +414,10 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
|
||||
size_t group_size,
|
||||
bool use_bias) {
|
||||
ov::Shape orig_shape = weight.get_shape();
|
||||
bool is_signed = (weight.get_element_type() == ov::element::i8); // Symmetric: signed weights, no ZP
|
||||
|
||||
// Expand dimensions for scales and zp/bias
|
||||
auto scale_shape = scales.get_shape();
|
||||
auto zp_shape = zp.get_shape();
|
||||
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
|
||||
|
||||
ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
|
||||
|
||||
@@ -403,37 +427,48 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
|
||||
} else {
|
||||
scale_shape.push_back(1);
|
||||
scales.set_shape(scale_shape);
|
||||
// For symmetric quantization, zp remains scalar (don't resize)
|
||||
if (!is_scalar_zp) {
|
||||
if (!is_signed && zp.get_size() > 0) {
|
||||
auto zp_shape = zp.get_shape();
|
||||
zp_shape.push_back(1);
|
||||
zp.set_shape(zp_shape);
|
||||
}
|
||||
}
|
||||
|
||||
// Create graph nodes
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
|
||||
ov::Output<ov::Node> result;
|
||||
if (use_bias && !is_scalar_zp) {
|
||||
// Bias path: w * s + b (zp tensor holds f16 bias values)
|
||||
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
if (is_signed) {
|
||||
// Signed path: q * s (no zero point subtraction needed)
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i8, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
} else {
|
||||
// Zero point path: (w - zp) * s
|
||||
auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
float zp_value;
|
||||
if (ov::op::util::get_single_value(zero_point, zp_value)) {
|
||||
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
|
||||
// Unsigned path
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
|
||||
if (use_bias && zp.get_size() > 0) {
|
||||
// Bias path: w * s + b (zp tensor holds f16 bias values)
|
||||
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
auto w_s =
|
||||
std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
} else {
|
||||
// Zero point path: (w - zp) * s
|
||||
auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
float zp_value;
|
||||
if (ov::op::util::get_single_value(zero_point, zp_value)) {
|
||||
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
|
||||
}
|
||||
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
|
||||
auto w_zp =
|
||||
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
}
|
||||
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
|
||||
auto w_zp =
|
||||
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
}
|
||||
|
||||
if (packed_shape.size() != 2) {
|
||||
@@ -452,11 +487,10 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
|
||||
size_t group_size,
|
||||
bool use_bias) {
|
||||
ov::Shape orig_weight_shape = weight.get_shape();
|
||||
bool is_signed = (weight.get_element_type() == ov::element::i4); // Symmetric: signed weights, no ZP
|
||||
|
||||
// Expand dimensions for scales and zp/bias
|
||||
ov::Shape scale_shape = scales.get_shape();
|
||||
auto zp_shape = zp.get_shape();
|
||||
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
|
||||
|
||||
// Create INT4 weight tensor
|
||||
ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
|
||||
@@ -467,36 +501,48 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
|
||||
} else {
|
||||
scale_shape.push_back(1);
|
||||
scales.set_shape(scale_shape);
|
||||
// For symmetric quantization, zp remains scalar (don't resize)
|
||||
if (!is_scalar_zp) {
|
||||
if (!is_signed && zp.get_size() > 0) {
|
||||
auto zp_shape = zp.get_shape();
|
||||
zp_shape.push_back(1);
|
||||
zp.set_shape(zp_shape);
|
||||
}
|
||||
}
|
||||
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
||||
|
||||
ov::Output<ov::Node> result;
|
||||
if (use_bias && !is_scalar_zp) {
|
||||
// Bias path: w * s + b (zp tensor holds f16 bias values)
|
||||
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
if (is_signed) {
|
||||
// Signed path: q * s (no zero point subtraction needed)
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i4, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
} else {
|
||||
// Zero point path: (w - zp) * s
|
||||
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
float zp_value;
|
||||
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
|
||||
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
|
||||
// Unsigned path
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
|
||||
if (use_bias && zp.get_size() > 0) {
|
||||
// Bias path: w * s + b (zp tensor holds f16 bias values)
|
||||
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
auto w_s =
|
||||
std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
} else {
|
||||
// Zero point path: (w - zp) * s
|
||||
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
float zp_value;
|
||||
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
|
||||
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
|
||||
}
|
||||
auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
|
||||
auto w_zp =
|
||||
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
}
|
||||
auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
|
||||
auto w_zp =
|
||||
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
}
|
||||
|
||||
if (packed_shape.size() != 2) {
|
||||
@@ -699,24 +745,32 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
|
||||
|
||||
// Quantized path (normal extraction or quantized requant)
|
||||
// Create weight/scale/zp tensors - shared between both paths
|
||||
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
|
||||
// For symmetric quantization, use signed types (i4/i8) and no ZP tensor
|
||||
ov::element::Type weight_type = layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) :
|
||||
(layout.is_u4 ? ov::element::u4 : ov::element::u8);
|
||||
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
|
||||
ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
|
||||
|
||||
if (output_base_ptr) {
|
||||
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
|
||||
result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
|
||||
result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
|
||||
result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
|
||||
if (!layout.is_symmetric) {
|
||||
ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
|
||||
result.zp = ov::Tensor(zp_type, scale_shape, buf_base + layout.zp_offset);
|
||||
}
|
||||
// else: result.zp remains default-constructed (empty) for symmetric
|
||||
} else {
|
||||
result.weights = ov::Tensor(weight_type, node_shape);
|
||||
result.scales = ov::Tensor(ov::element::f16, scale_shape);
|
||||
if (use_bias && !layout.is_symmetric) {
|
||||
// bias only has effect for asymmetric quant
|
||||
result.zp = ov::Tensor(ov::element::f16, zp_shape);
|
||||
} else {
|
||||
result.zp = ov::Tensor(weight_type, zp_shape);
|
||||
if (!layout.is_symmetric) {
|
||||
if (use_bias) {
|
||||
result.zp = ov::Tensor(ov::element::f16, scale_shape);
|
||||
} else {
|
||||
ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
|
||||
result.zp = ov::Tensor(zp_type, scale_shape);
|
||||
}
|
||||
}
|
||||
// else: result.zp remains default-constructed (empty) for symmetric
|
||||
}
|
||||
|
||||
if (layout.is_requant && layout.requant_type.has_value()) {
|
||||
@@ -741,59 +795,75 @@ void quantize_q4_0(const float * x,
|
||||
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path
|
||||
|
||||
// For Q4_0, zero point is always 8
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 8 | (8 << 4); // Pack two 4-bit values
|
||||
}
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
max = v;
|
||||
if (!is_symmetric) {
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f;
|
||||
float max = 0.0f;
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const float d = max / -8;
|
||||
|
||||
if (d == 0) {
|
||||
scales[i] = ov::float16(1.0f);
|
||||
// zp is already set to 8 for symmetric, or set per-block for asymmetric
|
||||
if (!is_scalar_zp) {
|
||||
const float d = max / -8;
|
||||
if (d == 0) {
|
||||
scales[i] = ov::float16(1.0f);
|
||||
if (i % 2 == 0) {
|
||||
zp[i / 2] = 8;
|
||||
} else {
|
||||
zp[i / 2] |= (8 << 4);
|
||||
}
|
||||
memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
|
||||
continue;
|
||||
}
|
||||
memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
|
||||
continue;
|
||||
}
|
||||
|
||||
const float id = 1.0f / d;
|
||||
scales[i] = ov::float16(d);
|
||||
// For asymmetric quantization, store per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
const float id = 1.0f / d;
|
||||
scales[i] = ov::float16(d);
|
||||
if (i % 2 == 0) {
|
||||
zp[i / 2] = 8;
|
||||
} else {
|
||||
zp[i / 2] |= (8 << 4);
|
||||
}
|
||||
for (int j = 0; j < qk / 2; ++j) {
|
||||
const float x0 = x[i * qk + 2 * j] * id;
|
||||
const float x1 = x[i * qk + 2 * j + 1] * id;
|
||||
const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
|
||||
const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
|
||||
weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
|
||||
}
|
||||
}
|
||||
|
||||
for (int j = 0; j < qk / 2; ++j) {
|
||||
const float x0 = x[i * qk + 2 * j] * id;
|
||||
const float x1 = x[i * qk + 2 * j + 1] * id;
|
||||
const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
|
||||
const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
|
||||
weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
|
||||
} else {
|
||||
// Symmetric: produce signed i4 values in [-8, 7]
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f;
|
||||
float max = 0.0f;
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
const float d = max / -8;
|
||||
if (d == 0) {
|
||||
scales[i] = ov::float16(1.0f);
|
||||
// i4 value 0 packed: 0x00
|
||||
memset(weights + i * qk / 2, 0, qk / 2);
|
||||
continue;
|
||||
}
|
||||
const float id = 1.0f / d;
|
||||
scales[i] = ov::float16(d);
|
||||
for (int j = 0; j < qk / 2; ++j) {
|
||||
const float x0 = x[i * qk + 2 * j] * id;
|
||||
const float x1 = x[i * qk + 2 * j + 1] * id;
|
||||
// Signed i4: range [-8, 7]. Quantize as round(x*id), then pack as 4-bit two's complement.
|
||||
int8_t si0 = (int8_t) std::max(-8, std::min(7, (int) roundf(x0)));
|
||||
int8_t si1 = (int8_t) std::max(-8, std::min(7, (int) roundf(x1)));
|
||||
weights[i * qk / 2 + j] = (si0 & 0x0F) | ((si1 & 0x0F) << 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -809,36 +879,42 @@ void quantize_q8_0(const float * x,
|
||||
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
|
||||
|
||||
// For Q8_0, zero point is always 128
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 128;
|
||||
}
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
if (!is_symmetric) {
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f;
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
amax = std::max(amax, fabsf(v));
|
||||
}
|
||||
const float d = amax / 127.0f;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
scales[i] = ov::float16(d);
|
||||
zp[i] = 128;
|
||||
for (int j = 0; j < qk; ++j) {
|
||||
const float x0 = x[i * qk + j] * id;
|
||||
const int8_t xi0 = roundf(x0);
|
||||
weights[i * qk + j] = (uint8_t) (xi0 + 128);
|
||||
}
|
||||
}
|
||||
|
||||
const float d = amax / 127.0f;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
scales[i] = ov::float16(d);
|
||||
// For asymmetric quantization, store per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
zp[i] = 128;
|
||||
}
|
||||
|
||||
for (int j = 0; j < qk; ++j) {
|
||||
const float x0 = x[i * qk + j] * id;
|
||||
const int8_t xi0 = roundf(x0);
|
||||
weights[i * qk + j] = (uint8_t) (xi0 + 128);
|
||||
} else {
|
||||
// Symmetric: store signed int8 values directly
|
||||
auto * signed_weights = reinterpret_cast<int8_t *>(weights);
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f;
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
amax = std::max(amax, fabsf(v));
|
||||
}
|
||||
const float d = amax / 127.0f;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
scales[i] = ov::float16(d);
|
||||
for (int j = 0; j < qk; ++j) {
|
||||
const float x0 = x[i * qk + j] * id;
|
||||
signed_weights[i * qk + j] = (int8_t) roundf(x0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -861,12 +937,8 @@ void quantize_q8_1(const float * x,
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
if (v < min) {
|
||||
min = v;
|
||||
}
|
||||
if (v > max) {
|
||||
max = v;
|
||||
}
|
||||
min = std::min(v, min);
|
||||
max = std::max(v, max);
|
||||
}
|
||||
|
||||
const float d = (max - min) / ((1 << 8) - 1);
|
||||
|
||||
@@ -9,12 +9,17 @@
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/cos.hpp>
|
||||
#include <openvino/op/gather.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/shape_of.hpp>
|
||||
#include <openvino/op/sin.hpp>
|
||||
#include <openvino/op/slice.hpp>
|
||||
#include <openvino/op/split.hpp>
|
||||
#include <openvino/op/subtract.hpp>
|
||||
#include <openvino/op/transpose.hpp>
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
#include <vector>
|
||||
|
||||
@@ -33,6 +38,12 @@ OutputVector translate_rope(const NodeContext & context) {
|
||||
auto data_node = context.get_input(0).get_node_shared_ptr();
|
||||
auto output_shape = context.get_output_shape().to_shape();
|
||||
int32_t * op_params = context.get_output_op_params();
|
||||
const int mode = (op_case & 0xFFFF0000) >> 16;
|
||||
op_case = (op_case & 0x0000FFFF);
|
||||
|
||||
constexpr int TYPE_NORMAL = 0;
|
||||
constexpr int TYPE_NEOX = 1;
|
||||
constexpr int TYPE_IMROPE = 2;
|
||||
|
||||
Output<Node> cos_theta_node;
|
||||
Output<Node> sin_theta_node;
|
||||
@@ -45,7 +56,7 @@ OutputVector translate_rope(const NodeContext & context) {
|
||||
if (context.get_input_size() == 3) {
|
||||
rope_freqs_weight = context.get_input(2).get_node_shared_ptr();
|
||||
}
|
||||
auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight);
|
||||
auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == TYPE_IMROPE);
|
||||
sin_theta_node = sin_cos.first;
|
||||
cos_theta_node = sin_cos.second;
|
||||
}
|
||||
@@ -65,11 +76,7 @@ OutputVector translate_rope(const NodeContext & context) {
|
||||
}
|
||||
}
|
||||
|
||||
const int mode = op_params[2];
|
||||
constexpr int ROPE_TYPE_NORMAL = 0;
|
||||
constexpr int ROPE_TYPE_NEOX = 2;
|
||||
|
||||
if (mode == ROPE_TYPE_NORMAL) {
|
||||
if (mode == TYPE_NORMAL) {
|
||||
auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
|
||||
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
|
||||
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
|
||||
@@ -97,7 +104,7 @@ OutputVector translate_rope(const NodeContext & context) {
|
||||
auto data_shape = ov::op::v0::Constant::create(
|
||||
ov::element::i64, {4}, std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
|
||||
res = std::make_shared<ov::op::v1::Reshape>(stack, data_shape, false);
|
||||
} else if (mode == ROPE_TYPE_NEOX) {
|
||||
} else if (mode == TYPE_NEOX) {
|
||||
auto data_split = std::make_shared<ov::op::v1::Split>(
|
||||
data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2);
|
||||
Output<Node> slice_data_node_0 = data_split->outputs()[0];
|
||||
@@ -112,6 +119,25 @@ OutputVector translate_rope(const NodeContext & context) {
|
||||
std::make_shared<ov::op::v1::Multiply>(slice_data_node_1, cos_theta_node));
|
||||
|
||||
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{first_half_node, second_half_node}, -1);
|
||||
} else if (mode == TYPE_IMROPE) {
|
||||
int64_t n_dims = data_node->get_shape()[3];
|
||||
auto cos_sin_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{1,-1,1,(n_dims >> 1)});
|
||||
auto cos_reshaped = std::make_shared<ov::op::v1::Reshape>(cos_theta_node, cos_sin_shape, true);
|
||||
auto sin_reshaped = std::make_shared<ov::op::v1::Reshape>(sin_theta_node, cos_sin_shape, true);
|
||||
|
||||
auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3});
|
||||
auto split_a = std::make_shared<ov::op::v1::Split>(data_node, split_axis, 2);
|
||||
auto x0 = split_a->output(0);
|
||||
auto x1 = split_a->output(1);
|
||||
auto mul_a = std::make_shared<ov::op::v1::Multiply>(x0, cos_reshaped);
|
||||
auto mul_b = std::make_shared<ov::op::v1::Multiply>(x1, sin_reshaped);
|
||||
auto sub = std::make_shared<ov::op::v1::Subtract>(mul_a, mul_b);
|
||||
|
||||
auto mul_c = std::make_shared<ov::op::v1::Multiply>(x0, sin_reshaped);
|
||||
auto mul_d = std::make_shared<ov::op::v1::Multiply>(x1, cos_reshaped);
|
||||
auto add = std::make_shared<ov::op::v1::Add>(mul_c, mul_d);
|
||||
|
||||
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{sub, add}, 3);
|
||||
}
|
||||
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
#include "../node_context.h"
|
||||
#include "../op_table.h"
|
||||
#include "../utils.h"
|
||||
|
||||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/op/gelu.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_unary_gelu(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 1);
|
||||
|
||||
auto input = context.get_input(0);
|
||||
auto res = std::make_shared<ov::op::v7::Gelu>(input);
|
||||
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
|
||||
@@ -31,6 +31,7 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
|
||||
{"GGML_OP_SOFT_MAX", op::translate_soft_max },
|
||||
{"GGML_OP_SUB", op::translate_1to1_match_2_inputs<v1::Subtract>},
|
||||
{"GGML_OP_TRANSPOSE", op::translate_transpose },
|
||||
{"GGML_UNARY_OP_GELU", op::translate_unary_gelu },
|
||||
{"GGML_UNARY_OP_SILU", op::translate_unary_silu },
|
||||
{"GGML_OP_VIEW", op::translate_view },
|
||||
{"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu },
|
||||
|
||||
@@ -21,6 +21,7 @@ GGML_OP_CONVERTER(translate_rms_norm);
|
||||
GGML_OP_CONVERTER(translate_rope);
|
||||
GGML_OP_CONVERTER(translate_scale);
|
||||
GGML_OP_CONVERTER(translate_unary_silu);
|
||||
GGML_OP_CONVERTER(translate_unary_gelu);
|
||||
GGML_OP_CONVERTER(translate_soft_max);
|
||||
GGML_OP_CONVERTER(translate_transpose);
|
||||
GGML_OP_CONVERTER(translate_view);
|
||||
|
||||
@@ -1,123 +0,0 @@
|
||||
#include "eliminate_zp.h"
|
||||
|
||||
#include <openvino/core/graph_util.hpp>
|
||||
#include <openvino/core/parallel.hpp>
|
||||
#include <openvino/core/rt_info.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/subtract.hpp>
|
||||
#include <openvino/pass/pattern/op/label.hpp>
|
||||
#include <openvino/pass/pattern/op/pattern.hpp>
|
||||
#include <openvino/pass/pattern/op/wrap_type.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace pass {
|
||||
|
||||
EliminateZeroPoints::EliminateZeroPoints() {
|
||||
// Find pattern:
|
||||
// (Multiply Any(scale)
|
||||
// (Subtract (Convert Constant(data)))
|
||||
// (Convert Constant(zero_point)))
|
||||
// where zero_point is a scalar
|
||||
// If data is u4 and zp value is 8 (q4_0), Replace the Subtract with an i4 Constant whose value is data - zp_val
|
||||
// If data is u8 and zp value is 128 (q8_0) or 32 (q6_k), Replace the Subtract with an i8 Constant
|
||||
|
||||
auto m_data_constant = ov::pass::pattern::wrap_type<ov::op::v0::Constant>();
|
||||
auto m_data_convert = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_data_constant});
|
||||
|
||||
auto m_zp_constant = ov::pass::pattern::wrap_type<ov::op::v0::Constant>();
|
||||
auto m_zp_convert = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_zp_constant});
|
||||
|
||||
auto m_subtract = ov::pass::pattern::wrap_type<ov::op::v1::Subtract>({m_data_convert, m_zp_convert});
|
||||
auto m_scale = ov::pass::pattern::any_input();
|
||||
auto m_multiply = ov::pass::pattern::wrap_type<ov::op::v1::Multiply>({m_scale, m_subtract});
|
||||
|
||||
const auto callback = [=](ov::pass::pattern::Matcher & m) {
|
||||
const auto & pattern_map = m.get_pattern_value_map();
|
||||
|
||||
auto multiply_node =
|
||||
std::dynamic_pointer_cast<ov::op::v1::Multiply>(pattern_map.at(m_multiply).get_node_shared_ptr());
|
||||
auto subtract_node =
|
||||
std::dynamic_pointer_cast<ov::op::v1::Subtract>(pattern_map.at(m_subtract).get_node_shared_ptr());
|
||||
auto data_constant =
|
||||
std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_data_constant).get_node_shared_ptr());
|
||||
auto zp_constant =
|
||||
std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_zp_constant).get_node_shared_ptr());
|
||||
|
||||
if (!multiply_node || !subtract_node || !data_constant || !zp_constant) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ov::shape_size(zp_constant->get_shape()) != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto data_type = data_constant->get_element_type();
|
||||
auto zp_data = zp_constant->cast_vector<int>();
|
||||
|
||||
if (zp_data.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int zp_value = zp_data[0];
|
||||
|
||||
bool should_eliminate = false;
|
||||
ov::element::Type target_type;
|
||||
|
||||
if (data_type == ov::element::u4 && zp_value == 8) {
|
||||
should_eliminate = true;
|
||||
target_type = ov::element::i4;
|
||||
} else if (data_type == ov::element::u8 && (zp_value == 128 || zp_value == 32)) {
|
||||
should_eliminate = true;
|
||||
target_type = ov::element::i8;
|
||||
}
|
||||
|
||||
if (!should_eliminate) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto data_shape = data_constant->get_shape();
|
||||
size_t total_elements = ov::shape_size(data_shape);
|
||||
|
||||
std::shared_ptr<ov::op::v0::Constant> new_constant;
|
||||
|
||||
// TODO improve performance
|
||||
if (data_type == ov::element::u4) {
|
||||
auto data_values = data_constant->cast_vector<uint8_t>();
|
||||
std::vector<int8_t> adjusted_values(total_elements);
|
||||
|
||||
ov::parallel_for(total_elements, [&](size_t i) {
|
||||
adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - 8);
|
||||
});
|
||||
|
||||
new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
|
||||
} else if (data_type == ov::element::u8) {
|
||||
auto data_values = data_constant->cast_vector<uint8_t>();
|
||||
std::vector<int8_t> adjusted_values(total_elements);
|
||||
|
||||
ov::parallel_for(total_elements, [&, zp_value](size_t i) {
|
||||
adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - zp_value);
|
||||
});
|
||||
|
||||
new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
|
||||
}
|
||||
|
||||
auto new_convert =
|
||||
std::make_shared<ov::op::v0::Convert>(new_constant, subtract_node->get_output_element_type(0));
|
||||
ov::replace_node(subtract_node, new_convert);
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
register_matcher(
|
||||
std::make_shared<ov::pass::pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
|
||||
callback);
|
||||
}
|
||||
|
||||
} // namespace pass
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
|
||||
@@ -1,17 +0,0 @@
|
||||
#include "openvino/pass/matcher_pass.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace pass {
|
||||
|
||||
class EliminateZeroPoints : public ov::pass::MatcherPass {
|
||||
public:
|
||||
OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::EliminateZeroPoints")
|
||||
EliminateZeroPoints();
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
|
||||
@@ -0,0 +1,41 @@
|
||||
// Copyright (C) 2018-2026 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <openvino/core/core_visibility.hpp>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/runtime_attribute.hpp>
|
||||
|
||||
namespace ov {
|
||||
|
||||
/**
|
||||
* @brief Holds weightless caching attributes of a single constant.
|
||||
*
|
||||
* WeightlessCacheAttribute class represents runtime info attribute that holds
|
||||
* the values of original size of the constant in bytes and the binary offset of the
|
||||
* constant's data in the weights file used by the weightless caching mechanism. It's
|
||||
* not copyable in case the data was changed (the original node was replaced by a new
|
||||
* one produced during the tranformation pipeline) - in that case weightless caching
|
||||
* can't be used for that constant.
|
||||
*/
|
||||
class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute {
|
||||
public:
|
||||
OPENVINO_RTTI("WeightlessCacheAttribute", "0", RuntimeAttribute)
|
||||
|
||||
WeightlessCacheAttribute() = delete;
|
||||
|
||||
WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype)
|
||||
: original_size(original_size),
|
||||
bin_offset(bin_offset),
|
||||
original_dtype(original_dtype) {}
|
||||
|
||||
bool is_copyable() const override;
|
||||
|
||||
size_t original_size;
|
||||
size_t bin_offset;
|
||||
ov::element::Type original_dtype;
|
||||
};
|
||||
|
||||
} // namespace ov
|
||||
@@ -3,15 +3,16 @@
|
||||
#include "ggml-openvino/openvino/node_context.h"
|
||||
#include "ggml-openvino/openvino/utils.h"
|
||||
#include "input_model.h"
|
||||
#include "pass/eliminate_zp.h"
|
||||
#include "pass/mark_decompression_convert_constant_folding.h"
|
||||
#include "pass/squeeze_matmul.h"
|
||||
#include "rt_info/weightless_caching_attributes.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/preprocess/pre_post_process.hpp>
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/broadcast.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
@@ -33,7 +34,6 @@
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
#include <openvino/pass/constant_folding.hpp>
|
||||
#include <openvino/pass/make_stateful.hpp>
|
||||
#include <openvino/core/preprocess/pre_post_process.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
@@ -240,6 +240,31 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
|
||||
resulting_model = std::make_shared<Model>(results, used_params);
|
||||
|
||||
apply_transformations(resulting_model);
|
||||
|
||||
// Set WeightlessCacheAttribute on large constants to avoid unnecessary memory copies
|
||||
// in the NPUW plugin. Without this attribute, NPUW's LazyTensor constructor
|
||||
// (lazy_tensor.cpp, op::Const::Const) will memcpy every constant "in case export
|
||||
// occurs", doubling memory usage per compile_model call.
|
||||
//
|
||||
// The bin_offset field serves as a unique key (not a real file offset) — this is
|
||||
// the same convention the GPU plugin uses for non-IR models (see
|
||||
// Plugin::set_weightless_cache_attributes in intel_gpu/src/plugin/plugin.cpp).
|
||||
// Each constant must have a distinct bin_offset, otherwise GPU's weightless cache
|
||||
// import will map multiple constants to the same data.
|
||||
//
|
||||
// Small constants (< 16 elements) are excluded since they may be introduced by
|
||||
// optimization patterns and the overhead is negligible.
|
||||
size_t offset = 0;
|
||||
for (auto & node : resulting_model->get_ordered_ops()) {
|
||||
if (auto cnst = ov::as_type_ptr<ov::op::v0::Constant>(node);
|
||||
cnst && cnst->get_byte_size() / cnst->get_element_type().size() >= 16) {
|
||||
auto & rt_info = cnst->get_rt_info();
|
||||
if (rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()) == rt_info.end()) {
|
||||
rt_info[ov::WeightlessCacheAttribute::get_type_info_static()] =
|
||||
ov::WeightlessCacheAttribute(cnst->get_byte_size(), offset++, cnst->get_element_type());
|
||||
}
|
||||
}
|
||||
}
|
||||
return resulting_model;
|
||||
}
|
||||
|
||||
@@ -257,7 +282,6 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
|
||||
}
|
||||
|
||||
if (ggml_model_decoder->is_static()) {
|
||||
manager.register_pass<pass::EliminateZeroPoints>();
|
||||
manager.register_pass<pass::SqueezeMatmul>();
|
||||
}
|
||||
manager.run_passes(model);
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <ctime>
|
||||
#include <memory>
|
||||
@@ -13,6 +14,7 @@
|
||||
#include <openvino/op/gather.hpp>
|
||||
#include <openvino/op/maximum.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/shape_of.hpp>
|
||||
#include <openvino/op/sin.hpp>
|
||||
#include <openvino/op/squeeze.hpp>
|
||||
@@ -87,8 +89,11 @@ ov::Output<ov::Node> rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], fl
|
||||
auto ramp_y =
|
||||
std::make_shared<ov::op::v1::Divide>(std::make_shared<ov::op::v1::Subtract>(dim_ids, corr_low), denom);
|
||||
auto ramp_clamped = std::make_shared<ov::op::v0::Clamp>(ramp_y, 0.0f, 1.0f);
|
||||
// rope_yarn_ramp returns (1 - clamp(y)), so invert before scaling
|
||||
auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
|
||||
auto ramp_inverted = std::make_shared<ov::op::v1::Subtract>(one, ramp_clamped);
|
||||
auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor});
|
||||
auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_clamped, ext_factor_node);
|
||||
auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_inverted, ext_factor_node);
|
||||
return ramp_mix;
|
||||
}
|
||||
|
||||
@@ -115,6 +120,7 @@ void ggml_rope_yarn_corr_dims(int n_dims,
|
||||
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params,
|
||||
std::shared_ptr<ov::Node> inp_pos,
|
||||
std::shared_ptr<ov::Node> rope_freqs_weight,
|
||||
bool imrope,
|
||||
bool stateful) {
|
||||
if (stateful) {
|
||||
inp_pos = std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
|
||||
@@ -122,6 +128,13 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
|
||||
auto pos_perm =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
|
||||
inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
|
||||
} else if (imrope) {
|
||||
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
|
||||
auto pos_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{5}, {0, 0, 0, 4, -1});
|
||||
inp_pos = std::make_shared<ov::op::v1::Reshape>(inp_pos, pos_shape, true);
|
||||
auto pos_transpose_shape =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{5}, std::vector<int64_t>{0, 1, 2, 4, 3});
|
||||
inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_transpose_shape);
|
||||
} else {
|
||||
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
|
||||
auto pos_perm =
|
||||
@@ -136,6 +149,7 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
const int n_dims = rope_params[1];
|
||||
const size_t n_dims_half = n_dims >> 1;
|
||||
const int n_ctx_orig = rope_params[4];
|
||||
memcpy(&freq_base, rope_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, rope_params + 6, sizeof(float));
|
||||
@@ -146,57 +160,74 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
float corr_dims[2];
|
||||
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
||||
|
||||
std::vector<float> factor(n_dims / 2);
|
||||
factor[0] = 1.0f;
|
||||
for (size_t i = 1; i < factor.size(); i++) {
|
||||
factor[i] = theta_scale * factor[i - 1];
|
||||
}
|
||||
std::vector<float> factor(n_dims_half);
|
||||
|
||||
Output<Node> freq_factors;
|
||||
if (stateful) {
|
||||
freq_factors =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
|
||||
} else {
|
||||
freq_factors =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor);
|
||||
}
|
||||
if (rope_freqs_weight) {
|
||||
freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
|
||||
}
|
||||
|
||||
auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
|
||||
auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
|
||||
theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
|
||||
|
||||
Output<Node> theta;
|
||||
float mscale = attn_factor;
|
||||
if (ext_factor == 0.0f) {
|
||||
theta = theta_interp;
|
||||
} else {
|
||||
auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
|
||||
Output<Node> one;
|
||||
if (stateful) {
|
||||
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
|
||||
} else {
|
||||
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
|
||||
if (imrope) {
|
||||
std::vector<int64_t> gather_indices(n_dims_half);
|
||||
for (size_t j = 0; j < n_dims_half; j++) {
|
||||
gather_indices[j] = j % 3;
|
||||
factor[j] = std::pow(theta_scale, j);
|
||||
}
|
||||
auto gather_indices_const =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{n_dims_half}, gather_indices);
|
||||
auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {4});
|
||||
inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, gather_indices_const, gather_axis);
|
||||
auto factor_const = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{n_dims_half}, factor);
|
||||
theta = std::make_shared<ov::op::v1::Multiply>(inp_pos, factor_const);
|
||||
} else {
|
||||
float corr_dims[2];
|
||||
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
||||
factor[0] = 1.0f;
|
||||
for (size_t i = 1; i < factor.size(); i++) {
|
||||
factor[i] = theta_scale * factor[i - 1];
|
||||
}
|
||||
if (stateful) {
|
||||
freq_factors =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
|
||||
} else {
|
||||
freq_factors =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor);
|
||||
}
|
||||
if (rope_freqs_weight) {
|
||||
freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
|
||||
}
|
||||
auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);
|
||||
|
||||
theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
|
||||
std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
|
||||
mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
|
||||
auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
|
||||
auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
|
||||
theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
|
||||
|
||||
if (ext_factor == 0.0f) {
|
||||
theta = theta_interp;
|
||||
} else {
|
||||
auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
|
||||
Output<Node> one;
|
||||
if (stateful) {
|
||||
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
|
||||
} else {
|
||||
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
|
||||
}
|
||||
auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);
|
||||
|
||||
theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
|
||||
std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
|
||||
mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
|
||||
}
|
||||
}
|
||||
|
||||
Output<Node> cos_theta = std::make_shared<ov::op::v0::Cos>(theta);
|
||||
Output<Node> sin_theta = std::make_shared<ov::op::v0::Sin>(theta);
|
||||
|
||||
auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
|
||||
if (!imrope) {
|
||||
auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
|
||||
|
||||
cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
|
||||
sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
|
||||
}
|
||||
|
||||
cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
|
||||
sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
|
||||
return std::make_pair(sin_theta, cos_theta);
|
||||
}
|
||||
|
||||
|
||||
@@ -67,6 +67,7 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::
|
||||
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
|
||||
std::shared_ptr<ov::Node> inp_pos,
|
||||
std::shared_ptr<ov::Node> rope_freqs_weight = nullptr,
|
||||
bool imrope = false,
|
||||
bool stateful = false);
|
||||
|
||||
ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len = 0);
|
||||
|
||||
@@ -81,8 +81,8 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
|
||||
enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) {
|
||||
auto & core = ov_singleton_core();
|
||||
const auto & config = ggml_openvino_get_compile_config();
|
||||
auto device = r_ctx->device;
|
||||
bool stateful = r_ctx->stateful;
|
||||
const auto & device = r_ctx->device;
|
||||
const auto & stateful = r_ctx->stateful;
|
||||
static auto is_static = false;
|
||||
|
||||
if (is_naive(cgraph)) {
|
||||
@@ -106,14 +106,26 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
|
||||
int64_t infer_end_time;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(r_ctx->ov_compute_mutex);
|
||||
|
||||
auto it = r_ctx->decoder_cache.find(key);
|
||||
|
||||
cache_hit = it != r_ctx->decoder_cache.end();
|
||||
std::shared_ptr<decoder_runtime_ctx> entry;
|
||||
ModelParams old_m_params;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
auto it = r_ctx->decoder_cache.find(key);
|
||||
cache_hit = it != r_ctx->decoder_cache.end();
|
||||
if (cache_hit) {
|
||||
entry = it->second;
|
||||
} else {
|
||||
auto mutex = std::make_shared<std::mutex>();
|
||||
entry = std::make_shared<decoder_runtime_ctx>(mutex);
|
||||
r_ctx->decoder_cache[key] = entry;
|
||||
}
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(*(entry->mutex));
|
||||
|
||||
if (cache_hit) {
|
||||
ggml_decoder = it->second;
|
||||
ggml_decoder = entry->ptr;
|
||||
old_m_params = ggml_decoder->get_model_params();
|
||||
cache_hit = old_m_params.can_reuse_dynamically(m_params);
|
||||
}
|
||||
@@ -126,7 +138,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
|
||||
ggml_decoder->update_io(cgraph);
|
||||
}
|
||||
ggml_decoder->add_extra_inputs();
|
||||
infer_request = r_ctx->infer_request_cache.at(key);
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
infer_request = r_ctx->infer_request_cache.at(key);
|
||||
}
|
||||
|
||||
if (stateful) {
|
||||
const auto * inp_pos = get_inp_pos_tensor(cgraph);
|
||||
@@ -170,7 +185,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
|
||||
conversion_end_time = decoder_end_time;
|
||||
compile_end_time = decoder_end_time;
|
||||
} else {
|
||||
r_ctx->infer_request_cache.erase(key);
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
r_ctx->infer_request_cache.erase(key);
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Model> model;
|
||||
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
|
||||
@@ -199,8 +217,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
|
||||
}
|
||||
compile_end_time = ggml_time_us();
|
||||
infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
|
||||
r_ctx->infer_request_cache[key] = infer_request;
|
||||
r_ctx->decoder_cache[key] = ggml_decoder;
|
||||
entry->ptr = ggml_decoder;
|
||||
|
||||
std::vector<std::string> ov_input_names;
|
||||
std::vector<std::string> ov_output_names;
|
||||
@@ -210,8 +227,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
|
||||
for (const auto & ov_output : model->get_results()) {
|
||||
ov_output_names.push_back(ov_output->get_friendly_name());
|
||||
}
|
||||
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
|
||||
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
r_ctx->infer_request_cache[key] = infer_request;
|
||||
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
|
||||
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
|
||||
}
|
||||
|
||||
if (stateful) {
|
||||
const auto * inp_pos = get_inp_pos_tensor(cgraph);
|
||||
@@ -224,8 +246,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
|
||||
}
|
||||
}
|
||||
|
||||
auto ov_input_names = r_ctx->ov_input_names_cache[key];
|
||||
auto ov_output_names = r_ctx->ov_output_names_cache[key];
|
||||
std::vector<std::string> ov_input_names;
|
||||
std::vector<std::string> ov_output_names;
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
ov_input_names = r_ctx->ov_input_names_cache[key];
|
||||
ov_output_names = r_ctx->ov_output_names_cache[key];
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ov_input_names.size(); i++) {
|
||||
auto param_name = ov_input_names[i];
|
||||
@@ -306,12 +333,26 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
int64_t compile_end_time;
|
||||
int64_t infer_end_time;
|
||||
|
||||
auto it = r_ctx->decoder_cache.find(key);
|
||||
|
||||
cache_hit = it != r_ctx->decoder_cache.end();
|
||||
std::shared_ptr<decoder_runtime_ctx> entry;
|
||||
ModelParams old_m_params;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
auto it = r_ctx->decoder_cache.find(key);
|
||||
cache_hit = it != r_ctx->decoder_cache.end();
|
||||
if (cache_hit) {
|
||||
entry = it->second;
|
||||
} else {
|
||||
auto mutex = std::make_shared<std::mutex>();
|
||||
entry = std::make_shared<decoder_runtime_ctx>(mutex);
|
||||
r_ctx->decoder_cache[key] = entry;
|
||||
}
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(*(entry->mutex));
|
||||
|
||||
if (cache_hit) {
|
||||
ggml_decoder = it->second;
|
||||
ggml_decoder = entry->ptr;
|
||||
old_m_params = ggml_decoder->get_model_params();
|
||||
cache_hit = old_m_params.can_reuse_statically(m_params);
|
||||
}
|
||||
@@ -325,14 +366,21 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
ggml_decoder->update_io(cgraph);
|
||||
}
|
||||
ggml_decoder->add_extra_inputs();
|
||||
infer_request = is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
infer_request =
|
||||
is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
|
||||
}
|
||||
|
||||
decoder_end_time = ggml_time_us();
|
||||
conversion_end_time = decoder_end_time;
|
||||
compile_end_time = decoder_end_time;
|
||||
} else {
|
||||
r_ctx->infer_request_cache.erase(key);
|
||||
r_ctx->infer_request_cache_prefill.erase(key);
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
r_ctx->infer_request_cache.erase(key);
|
||||
r_ctx->infer_request_cache_prefill.erase(key);
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Model> model;
|
||||
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
|
||||
@@ -372,16 +420,14 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
compiled_model_decode = core.compile_model(model_decode, device, config);
|
||||
}
|
||||
|
||||
r_ctx->infer_request_cache_prefill[key] =
|
||||
std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
|
||||
r_ctx->infer_request_cache[key] =
|
||||
std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
|
||||
auto infer_request_prefill = std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
|
||||
auto infer_request_decode = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
|
||||
compile_end_time = ggml_time_us();
|
||||
|
||||
model = is_prefill ? model_prefill : model_decode;
|
||||
ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode;
|
||||
infer_request = is_prefill ? r_ctx->infer_request_cache_prefill[key] : r_ctx->infer_request_cache[key];
|
||||
r_ctx->decoder_cache[key] = ggml_decoder;
|
||||
infer_request = is_prefill ? infer_request_prefill : infer_request_decode;
|
||||
entry->ptr = ggml_decoder;
|
||||
|
||||
std::vector<std::string> ov_input_names;
|
||||
std::vector<std::string> ov_output_names;
|
||||
@@ -391,18 +437,29 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
for (const auto & ov_output : model->get_results()) {
|
||||
ov_output_names.push_back(ov_output->get_friendly_name());
|
||||
}
|
||||
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
|
||||
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
r_ctx->infer_request_cache_prefill[key] = infer_request_prefill;
|
||||
r_ctx->infer_request_cache[key] = infer_request_decode;
|
||||
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
|
||||
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
|
||||
}
|
||||
}
|
||||
|
||||
auto ov_input_names = r_ctx->ov_input_names_cache[key];
|
||||
auto ov_output_names = r_ctx->ov_output_names_cache[key];
|
||||
std::vector<std::string> ov_input_names_local;
|
||||
std::vector<std::string> ov_output_names_local;
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
|
||||
ov_input_names_local = r_ctx->ov_input_names_cache[key];
|
||||
ov_output_names_local = r_ctx->ov_output_names_cache[key];
|
||||
}
|
||||
|
||||
if (is_prefill) {
|
||||
auto inp_len = inp_pos->ne[0];
|
||||
for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) {
|
||||
for (size_t i = 0; i < ov_input_names.size(); i++) {
|
||||
auto param_name = ov_input_names[i];
|
||||
for (size_t i = 0; i < ov_input_names_local.size(); i++) {
|
||||
auto param_name = ov_input_names_local[i];
|
||||
auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index);
|
||||
infer_request->set_input_tensor(i, input_tensor);
|
||||
|
||||
@@ -412,8 +469,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
|
||||
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
|
||||
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
|
||||
auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
|
||||
infer_request->set_output_tensor(i, output_tensor);
|
||||
}
|
||||
@@ -421,16 +478,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
infer_request->infer();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
|
||||
const auto output_tensor = infer_request->get_output_tensor(i);
|
||||
print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
|
||||
print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
|
||||
}
|
||||
}
|
||||
}
|
||||
infer_end_time = ggml_time_us();
|
||||
} else {
|
||||
for (size_t i = 0; i < ov_input_names.size(); i++) {
|
||||
auto param_name = ov_input_names[i];
|
||||
for (size_t i = 0; i < ov_input_names_local.size(); i++) {
|
||||
auto param_name = ov_input_names_local[i];
|
||||
auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name);
|
||||
infer_request->set_input_tensor(i, input_tensor);
|
||||
|
||||
@@ -440,8 +497,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
|
||||
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
|
||||
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
|
||||
auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
|
||||
infer_request->set_output_tensor(i, output_tensor);
|
||||
}
|
||||
@@ -450,9 +507,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
|
||||
infer_end_time = ggml_time_us();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
|
||||
const auto output_tensor = infer_request->get_output_tensor(i);
|
||||
print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
|
||||
print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,12 +3,15 @@
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <cstddef>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <openvino/runtime/core.hpp>
|
||||
#include <openvino/runtime/infer_request.hpp>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
struct graph_key {
|
||||
@@ -40,11 +43,17 @@ struct graph_key_hash {
|
||||
}
|
||||
};
|
||||
|
||||
struct decoder_runtime_ctx {
|
||||
decoder_runtime_ctx(std::shared_ptr<std::mutex> mutex) : mutex(std::move(mutex)) {}
|
||||
std::shared_ptr<std::mutex> mutex;
|
||||
std::shared_ptr<GgmlOvDecoder> ptr;
|
||||
};
|
||||
|
||||
struct ov_runtime_context {
|
||||
std::mutex ov_compute_mutex;
|
||||
mutable std::mutex ctx_mutex;
|
||||
std::string device;
|
||||
bool stateful;
|
||||
std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
|
||||
std::unordered_map<graph_key, std::shared_ptr<decoder_runtime_ctx>, graph_key_hash> decoder_cache;
|
||||
std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
|
||||
std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill;
|
||||
std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;
|
||||
@@ -53,11 +62,22 @@ struct ov_runtime_context {
|
||||
// Simultanous stateful inference request support to be added.
|
||||
size_t stateful_kv_size;
|
||||
std::map<std::string, std::string> kv_state_input_name_map;
|
||||
std::atomic<int> backend_count;
|
||||
|
||||
ov_runtime_context() :
|
||||
device("CPU"),
|
||||
stateful(false),
|
||||
stateful_kv_size(0) {}
|
||||
stateful_kv_size(0),
|
||||
backend_count(0) {}
|
||||
|
||||
void clear_caches() {
|
||||
std::lock_guard<std::mutex> lock(ctx_mutex);
|
||||
decoder_cache.clear();
|
||||
infer_request_cache.clear();
|
||||
infer_request_cache_prefill.clear();
|
||||
ov_input_names_cache.clear();
|
||||
ov_output_names_cache.clear();
|
||||
}
|
||||
};
|
||||
|
||||
enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph, ggml_backend_t backend);
|
||||
|
||||
@@ -2,6 +2,7 @@ message(STATUS "Using RPC backend")
|
||||
|
||||
ggml_add_backend_library(ggml-rpc
|
||||
ggml-rpc.cpp
|
||||
transport.cpp
|
||||
)
|
||||
|
||||
if (WIN32)
|
||||
|
||||
+64
-742
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user