mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-13 09:16:42 +02:00
Compare commits
28 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| d8a24ccee2 | |||
| c34b92235b | |||
| e37abd6b5f | |||
| f58bad4137 | |||
| cd5044661c | |||
| ebc10770ac | |||
| 3e7bd4f39a | |||
| f7ca93d12c | |||
| 02182fc5b9 | |||
| f532be8fac | |||
| e08c226a2c | |||
| 70b54e140c | |||
| 6471e3c090 | |||
| 88a39274ec | |||
| 85f99dca8b | |||
| 099ea76fb4 | |||
| ba1df050f3 | |||
| 1593d5684d | |||
| 4c6595503f | |||
| 263cc04a54 | |||
| 17e59d6209 | |||
| fdc3db9b65 | |||
| 1af154a76f | |||
| 18ef86ecec | |||
| 1bfbdb134e | |||
| 68f30663cf | |||
| db94854ff5 | |||
| ac4cddeb0d |
@@ -1,6 +1,7 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=12.8.1
|
||||
ARG GCC_VERSION=14
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
@@ -12,13 +13,14 @@ ARG APP_REVISION=N/A
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
ARG GCC_VERSION
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
ARG CUDA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
|
||||
apt-get install -y gcc-${GCC_VERSION} g++-${GCC_VERSION} build-essential cmake python3 python3-pip git libssl-dev libgomp1
|
||||
|
||||
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
|
||||
ENV CC=gcc-${GCC_VERSION} CXX=g++-${GCC_VERSION} CUDAHOSTCXX=g++-${GCC_VERSION}
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
+103
-124
@@ -34,129 +34,108 @@ env:
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-sycl:
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# ubuntu-24-sycl:
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [fp32]
|
||||
# include:
|
||||
# - build: fp32
|
||||
# fp16: OFF
|
||||
#
|
||||
# runs-on: ubuntu-24.04
|
||||
#
|
||||
# env:
|
||||
# ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# LEVEL_ZERO_VERSION: "1.28.2"
|
||||
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
#
|
||||
# continue-on-error: true
|
||||
#
|
||||
# steps:
|
||||
# - uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: bash
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
#
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: sycl-ubuntu-24-${{ matrix.build }}
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# cmake -B build \
|
||||
# -G "Ninja" \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_SYCL=ON \
|
||||
# -DCMAKE_C_COMPILER=icx \
|
||||
# -DCMAKE_CXX_COMPILER=icpx \
|
||||
# -DLLAMA_OPENSSL=OFF \
|
||||
# -DGGML_NATIVE=OFF \
|
||||
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
# time cmake --build build --config Release -j $(nproc)
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# windows-latest-sycl:
|
||||
# runs-on: windows-2022
|
||||
#
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash
|
||||
#
|
||||
# env:
|
||||
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: pwsh
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: sycl-windows-latest
|
||||
# variant: ccache
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: examples/sycl/win-build-sycl.bat
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: sycl-ubuntu-24-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
windows-latest-sycl:
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: sycl-windows-latest
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
+245
-218
@@ -59,8 +59,31 @@ jobs:
|
||||
echo "should_release=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
get-version:
|
||||
runs-on: ubuntu-slim
|
||||
outputs:
|
||||
ui_version: ${{ steps.version.outputs.ui_version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- id: version
|
||||
run: |
|
||||
# Resolve UI version: BUILD_NUMBER from cmake/build-info.cmake > git hash + epoch > fallback
|
||||
version=""
|
||||
if grep -q "BUILD_NUMBER" cmake/build-info.cmake; then
|
||||
build_number=$(grep "set(BUILD_NUMBER" cmake/build-info.cmake | grep -oP '\d+')
|
||||
if [ -n "$build_number" ] && [ "$build_number" -gt 0 ]; then
|
||||
version="b${build_number}"
|
||||
fi
|
||||
fi
|
||||
if [ -z "$version" ]; then
|
||||
version=$(git rev-parse --short HEAD)-$(date +%s)
|
||||
fi
|
||||
echo "ui_version=${version}" >> $GITHUB_OUTPUT
|
||||
|
||||
macos-cpu:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -116,6 +139,7 @@ jobs:
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
@@ -141,7 +165,7 @@ jobs:
|
||||
name: llama-bin-macos-${{ matrix.build }}.tar.gz
|
||||
|
||||
ubuntu-cpu:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -201,6 +225,7 @@ jobs:
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
@@ -227,7 +252,7 @@ jobs:
|
||||
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
|
||||
|
||||
ubuntu-vulkan:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
strategy:
|
||||
@@ -287,6 +312,7 @@ jobs:
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DGGML_VULKAN=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
@@ -312,7 +338,7 @@ jobs:
|
||||
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
|
||||
android-arm64:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
@@ -379,6 +405,7 @@ jobs:
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
@@ -404,7 +431,7 @@ jobs:
|
||||
name: llama-bin-android-arm64.tar.gz
|
||||
|
||||
ubuntu-24-openvino:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
@@ -476,7 +503,8 @@ jobs:
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENVINO=ON
|
||||
-DGGML_OPENVINO=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
|
||||
cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||
|
||||
- name: ccache-clear
|
||||
@@ -754,213 +782,205 @@ jobs:
|
||||
path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# windows-sycl:
|
||||
#
|
||||
# runs-on: windows-2022
|
||||
#
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash
|
||||
#
|
||||
# env:
|
||||
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: pwsh
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
#
|
||||
# - name: Setup Node.js
|
||||
# uses: actions/setup-node@v6
|
||||
# with:
|
||||
# node-version: "24"
|
||||
# cache: "npm"
|
||||
# cache-dependency-path: "tools/ui/package-lock.json"
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: release-windows-2022-x64-sycl
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# shell: cmd
|
||||
# run: |
|
||||
# call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
# cmake -G "Ninja" -B build ^
|
||||
# -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
|
||||
# -DCMAKE_BUILD_TYPE=Release ^
|
||||
# -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
# -DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
# -DLLAMA_BUILD_BORINGSSL=ON
|
||||
# cmake --build build --target ggml-sycl -j
|
||||
#
|
||||
# - name: Build the release package
|
||||
# id: pack_artifacts
|
||||
# run: |
|
||||
# echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
# ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
|
||||
# if [ -n "$ZE_LOADER_DLL" ]; then
|
||||
# echo "Using Level Zero loader: $ZE_LOADER_DLL"
|
||||
# cp "$ZE_LOADER_DLL" ./build/bin
|
||||
# else
|
||||
# echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
|
||||
# fi
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
|
||||
#
|
||||
# echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
# 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
#
|
||||
# - name: Upload the release package
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-bin-win-sycl-x64.zip
|
||||
# name: llama-bin-win-sycl-x64.zip
|
||||
windows-sycl:
|
||||
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# ubuntu-24-sycl:
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [fp32]
|
||||
# include:
|
||||
# - build: fp32
|
||||
# fp16: OFF
|
||||
#
|
||||
# runs-on: ubuntu-24.04
|
||||
#
|
||||
# env:
|
||||
# ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# LEVEL_ZERO_VERSION: "1.28.2"
|
||||
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: bash
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
#
|
||||
# - name: Setup Node.js
|
||||
# uses: actions/setup-node@v6
|
||||
# with:
|
||||
# node-version: "24"
|
||||
# cache: "npm"
|
||||
# cache-dependency-path: "tools/ui/package-lock.json"
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: release-ubuntu-24.04-sycl
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# cmake -B build \
|
||||
# -G "Ninja" \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_SYCL=ON \
|
||||
# -DCMAKE_C_COMPILER=icx \
|
||||
# -DCMAKE_CXX_COMPILER=icpx \
|
||||
# -DLLAMA_OPENSSL=OFF \
|
||||
# -DGGML_NATIVE=OFF \
|
||||
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
# time cmake --build build --config Release -j $(nproc)
|
||||
#
|
||||
# - name: Determine tag name
|
||||
# id: tag
|
||||
# uses: ./.github/actions/get-tag-name
|
||||
#
|
||||
# - name: Pack artifacts
|
||||
# id: pack_artifacts
|
||||
# run: |
|
||||
# cp LICENSE ./build/bin/
|
||||
# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
#
|
||||
# - name: Upload artifacts
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
# name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: release-windows-2022-x64-sycl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
run: |
|
||||
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
cmake -G "Ninja" -B build ^
|
||||
-DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --target ggml-sycl -j %NUMBER_OF_PROCESSORS%
|
||||
|
||||
- name: ccache-clear
|
||||
uses: ./.github/actions/ccache-clear
|
||||
with:
|
||||
key: release-windows-2022-x64-sycl
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
|
||||
if [ -n "$ZE_LOADER_DLL" ]; then
|
||||
echo "Using Level Zero loader: $ZE_LOADER_DLL"
|
||||
cp "$ZE_LOADER_DLL" ./build/bin
|
||||
else
|
||||
echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
|
||||
fi
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
ubuntu-24-sycl:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: release-ubuntu-24.04-sycl-${{ matrix.build }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: ccache-clear
|
||||
uses: ./.github/actions/ccache-clear
|
||||
with:
|
||||
key: release-ubuntu-24.04-sycl-${{ matrix.build }}
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
|
||||
ubuntu-22-rocm:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
@@ -1052,6 +1072,7 @@ jobs:
|
||||
-DGGML_HIP=ON \
|
||||
-DHIP_PLATFORM=amd \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
@@ -1080,7 +1101,7 @@ jobs:
|
||||
name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
|
||||
|
||||
windows-hip:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2022
|
||||
@@ -1176,6 +1197,7 @@ jobs:
|
||||
-DGPU_TARGETS="${{ matrix.gpu_targets }}" `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_HIP=ON `
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} `
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
|
||||
md "build\bin\rocblas\library\"
|
||||
@@ -1203,7 +1225,7 @@ jobs:
|
||||
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
|
||||
|
||||
ios-xcode:
|
||||
needs: [check-release]
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
runs-on: macos-26
|
||||
|
||||
@@ -1232,7 +1254,8 @@ jobs:
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=16.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
@@ -1352,10 +1375,12 @@ jobs:
|
||||
# path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
# name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
|
||||
ui:
|
||||
needs: [check-release]
|
||||
ui-build:
|
||||
needs: [check-release, get-version]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
with:
|
||||
hf_ui_version: ${{ needs.get-version.outputs.ui_version }}
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
@@ -1368,6 +1393,7 @@ jobs:
|
||||
runs-on: ubuntu-slim
|
||||
|
||||
needs:
|
||||
- get-version
|
||||
- windows
|
||||
- windows-cpu
|
||||
- windows-cuda
|
||||
@@ -1382,7 +1408,7 @@ jobs:
|
||||
- macos-cpu
|
||||
- ios-xcode
|
||||
#- openEuler-cann
|
||||
- ui
|
||||
- ui-build
|
||||
|
||||
outputs:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
@@ -1482,7 +1508,8 @@ jobs:
|
||||
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
|
||||
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
|
||||
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
|
||||
- Ubuntu x64 (SYCL FP32) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
|
||||
- [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
|
||||
|
||||
**Android:**
|
||||
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
|
||||
@@ -1493,7 +1520,7 @@ jobs:
|
||||
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
|
||||
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
|
||||
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
|
||||
- Windows x64 (SYCL) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
|
||||
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
|
||||
|
||||
**openEuler:**
|
||||
|
||||
@@ -2,6 +2,11 @@ name: UI Build
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
hf_ui_version:
|
||||
description: 'Version string for version.json (e.g. 12345)'
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
build:
|
||||
@@ -25,9 +30,16 @@ jobs:
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build application
|
||||
env:
|
||||
HF_UI_VERSION: ${{ inputs.hf_ui_version || '' }}
|
||||
LLAMA_UI_VERSION: ${{ inputs.hf_ui_version || 'b0000' }}
|
||||
run: npm run build
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run PWA unit tests (versioned build output)
|
||||
run: npx vitest --project=unit --run tests/unit/pwa.spec.ts
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Generate checksums
|
||||
run: |
|
||||
cd tools/ui/dist
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
name: UI (self-hosted)
|
||||
|
||||
# these are the same as ui.yml, but with self-hosted runners
|
||||
# the runners come with pre-installed Playwright browsers version: 1.56.1
|
||||
# the jobs are much lighter because they don't need to install node and playwright browsers
|
||||
# the jobs are lighter because they don't need to install Node.js or Playwright browsers
|
||||
# the runner has pre-installed Playwright browsers for @playwright/test (1.56.1) at /ms-playwright/
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@@ -61,6 +61,12 @@ jobs:
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Download built UI artifacts
|
||||
uses: actions/download-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Run type checking
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run check
|
||||
@@ -72,12 +78,12 @@ jobs:
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Client tests
|
||||
if: ${{ always() }}
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run test:client
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Unit tests
|
||||
if: ${{ always() }}
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run test:unit
|
||||
working-directory: tools/ui
|
||||
|
||||
@@ -97,22 +103,23 @@ jobs:
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build application
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run build
|
||||
working-directory: tools/ui
|
||||
- name: Download built UI artifacts
|
||||
uses: actions/download-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Build Storybook
|
||||
if: ${{ always() }}
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run build-storybook
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run UI tests
|
||||
if: ${{ always() }}
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run test:ui -- --testTimeout=60000
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run E2E tests
|
||||
if: ${{ always() }}
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run test:e2e
|
||||
working-directory: tools/ui
|
||||
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
ui-checks:
|
||||
name: Checks
|
||||
needs: ui-build
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-24.04
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout code
|
||||
@@ -60,6 +60,12 @@ jobs:
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: Download built UI artifacts
|
||||
uses: actions/download-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Install dependencies
|
||||
id: setup
|
||||
if: ${{ steps.node.conclusion == 'success' }}
|
||||
@@ -87,7 +93,7 @@ jobs:
|
||||
run: npm run test:client
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Unit tests
|
||||
- name: Run Unit tests (uses pre-built dist/ from ui-build)
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run test:unit
|
||||
working-directory: tools/ui
|
||||
@@ -95,7 +101,7 @@ jobs:
|
||||
e2e-tests:
|
||||
name: E2E Tests
|
||||
needs: ui-build
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
@@ -117,10 +123,11 @@ jobs:
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build application
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run build
|
||||
working-directory: tools/ui
|
||||
- name: Download built UI artifacts (reuses ui-build)
|
||||
uses: actions/download-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Install Playwright browsers
|
||||
id: playwright
|
||||
@@ -138,7 +145,7 @@ jobs:
|
||||
run: npm run test:ui -- --testTimeout=60000
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run E2E tests
|
||||
- name: Run E2E tests (uses pre-built dist/ from ui-build)
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run test:e2e
|
||||
working-directory: tools/ui
|
||||
|
||||
@@ -92,13 +92,6 @@
|
||||
!/examples/sycl/*.bat
|
||||
!/examples/sycl/*.sh
|
||||
|
||||
# Server Web UI temporary files (+ legacy directory)
|
||||
|
||||
/tools/server/webui/node_modules
|
||||
/tools/server/webui/dist
|
||||
/tools/ui/node_modules
|
||||
/tools/ui/dist
|
||||
|
||||
# Python
|
||||
|
||||
/.venv
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# llama.cpp
|
||||
|
||||

|
||||

|
||||
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://github.com/ggml-org/llama.cpp/releases)
|
||||
|
||||
@@ -2243,6 +2243,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.image_max_tokens = value;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
|
||||
add_opt(common_arg(
|
||||
{"--mtmd-batch-max-tokens"}, "N",
|
||||
string_format("maximum number of image tokens per batch when encoding images (default: %d)", params.mtmd_batch_max_tokens),
|
||||
[](common_params & params, int value) {
|
||||
params.mtmd_batch_max_tokens = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MTMD_BATCH_MAX_TOKENS"));
|
||||
if (llama_supports_rpc()) {
|
||||
add_opt(common_arg(
|
||||
{"--rpc"}, "SERVERS",
|
||||
|
||||
@@ -575,6 +575,7 @@ struct common_params {
|
||||
std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
|
||||
int image_min_tokens = -1;
|
||||
int image_max_tokens = -1;
|
||||
int mtmd_batch_max_tokens = 1024;
|
||||
|
||||
// finetune
|
||||
struct lr_opt lr;
|
||||
|
||||
+29
-6
@@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error {
|
||||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
static std::vector<llama_device_memory_data> common_get_device_memory_data_impl(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
@@ -150,6 +150,29 @@ std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
return ret;
|
||||
}
|
||||
|
||||
common_device_memory_data_vec common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
ggml_log_level log_level) {
|
||||
std::vector<llama_device_memory_data> impl = common_get_device_memory_data_impl(
|
||||
path_model, mparams, cparams, devs, hp_ngl, hp_n_ctx_train, hp_n_expert, log_level);
|
||||
|
||||
common_device_memory_data_vec ret(impl.size());
|
||||
for (size_t i = 0; i < impl.size(); i++) {
|
||||
ret[i].total = impl[i].total;
|
||||
ret[i].free = impl[i].free;
|
||||
ret[i].model = impl[i].mb.model;
|
||||
ret[i].context = impl[i].mb.context;
|
||||
ret[i].compute = impl[i].mb.compute;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void common_params_fit_impl(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
@@ -169,7 +192,7 @@ static void common_params_fit_impl(
|
||||
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
||||
|
||||
LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const dmds_t dmds_full = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const size_t nd = devs.size(); // number of devices
|
||||
|
||||
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
@@ -304,7 +327,7 @@ static void common_params_fit_impl(
|
||||
|
||||
int64_t sum_projected_used_min_ctx = 0;
|
||||
cparams->n_ctx = n_ctx_min;
|
||||
const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const dmds_t dmds_min_ctx = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
if (nd == 0) {
|
||||
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
|
||||
} else {
|
||||
@@ -482,7 +505,7 @@ static void common_params_fit_impl(
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
|
||||
|
||||
const dmds_t dmd_nl = common_get_device_memory_data(
|
||||
const dmds_t dmd_nl = common_get_device_memory_data_impl(
|
||||
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
LOG_TRC("%s: memory for test allocation by device:\n", func_name);
|
||||
@@ -510,7 +533,7 @@ static void common_params_fit_impl(
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
|
||||
LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
|
||||
const dmds_t dmds_cpu_moe = common_get_device_memory_data(
|
||||
const dmds_t dmds_cpu_moe = common_get_device_memory_data_impl(
|
||||
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
@@ -940,7 +963,7 @@ void common_fit_print(
|
||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||
uint32_t hp_nex = 0; // hparams.n_expert
|
||||
|
||||
auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
|
||||
auto dmd = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
|
||||
GGML_ASSERT(dmd.size() == devs.size() + 1);
|
||||
|
||||
for (size_t id = 0; id < devs.size(); id++) {
|
||||
|
||||
+32
-24
@@ -1,9 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "llama.h"
|
||||
#include "../src/llama-ext.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
@@ -18,31 +16,41 @@ enum common_params_fit_status {
|
||||
// - this function is NOT thread safe because it modifies the global llama logger state
|
||||
// - only parameters that have the same value as in llama_default_model_params are modified
|
||||
// with the exception of the context size which is modified if and only if equal to 0
|
||||
enum common_params_fit_status common_fit_params(
|
||||
const char * path_model,
|
||||
struct llama_model_params * mparams,
|
||||
struct llama_context_params * cparams,
|
||||
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
|
||||
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
|
||||
size_t * margins, // margins of memory to leave per device in bytes
|
||||
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
|
||||
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
|
||||
common_params_fit_status common_fit_params(
|
||||
const char * path_model,
|
||||
llama_model_params * mparams,
|
||||
llama_context_params * cparams,
|
||||
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
|
||||
llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
|
||||
size_t * margins, // margins of memory to leave per device in bytes
|
||||
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
|
||||
ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
|
||||
|
||||
// print estimated memory to stdout
|
||||
void common_fit_print(
|
||||
const char * path_model,
|
||||
struct llama_model_params * mparams,
|
||||
struct llama_context_params * cparams);
|
||||
const char * path_model,
|
||||
llama_model_params * mparams,
|
||||
llama_context_params * cparams);
|
||||
|
||||
void common_memory_breakdown_print(const struct llama_context * ctx);
|
||||
void common_memory_breakdown_print(const llama_context * ctx);
|
||||
|
||||
struct common_device_memory_data {
|
||||
int64_t total;
|
||||
int64_t free;
|
||||
size_t model;
|
||||
size_t context;
|
||||
size_t compute;
|
||||
};
|
||||
|
||||
using common_device_memory_data_vec = std::vector<common_device_memory_data>;
|
||||
|
||||
// Load a model + context with no_alloc and return the per-device memory breakdown.
|
||||
std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const struct llama_model_params * mparams,
|
||||
const struct llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
enum ggml_log_level log_level);
|
||||
common_device_memory_data_vec common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
ggml_log_level log_level);
|
||||
|
||||
+418
-10
@@ -375,31 +375,437 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// EAGLE3 speculative decoding state
|
||||
//
|
||||
// Input of draft decoder: (This is different compared to MTP)
|
||||
// At "pos P", the decoder takes input pair (t_{P+1}, g_P), with RoPE at P.
|
||||
// - t_{P+1} = token at sequence pos P+1 (the *next* token after P)
|
||||
// - g_P = encoder output = projection of target's extracted hidden states at P
|
||||
//
|
||||
// Deferred boundary (MTP doesn't have this issue):
|
||||
// Within a single process() call with n_tokens, we can only write decoder KV for
|
||||
// training pos 0..n_tokens-2. The last training pos (n_tokens-1) needs t_{n_tokens}
|
||||
// which lies *outside* this batch — it is the token target will sample next or the first token from next ubatch.
|
||||
// So the last training pos of each process() call is *deferred* to whichever next call has
|
||||
// the missing token in hand:
|
||||
// - multi-ubatch prefill: the next process()'s first token completes the pair
|
||||
// (handled by the per-seq "cross-ubatch bridge")
|
||||
// - single-ubatch prefill / after verify: draft()'s seed step uses "dp.id_last"
|
||||
// (target's freshest sample) to complete the pair
|
||||
//
|
||||
// Per-seq carry-over state:
|
||||
// pending_g_last [n_embd_dec] ┐ the deferred boundary's (g, pos). Set by
|
||||
// pending_pos_last llama_pos ┘ process() at end of ubatch (= last row);
|
||||
// rebased by accept() to first-non-accepted pos.
|
||||
// verify_g [N × n_embd_dec] snapshot of process()'s encoder output;
|
||||
// verify_pos_first llama_pos consumed by accept() to recover the right
|
||||
// verify_g_rows int32_t pending_g_last row for any n_accepted value.
|
||||
//
|
||||
// Performance is overall good but there is waste in verify cycle:
|
||||
// process() runs encoder + decoder on the *full* verify batch including rows for
|
||||
// rejected drafts. The KV at those positions is then dropped.
|
||||
//
|
||||
// TODO: Not sure if we need optimization for this waste?
|
||||
// If so we may need hybrid stash:
|
||||
// in verify mode, have process() only stash features and let draft() seed run
|
||||
// encoder+decoder on n_accepted+1 rows).
|
||||
struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
|
||||
//common_params_speculative_eagle3 params;
|
||||
common_params_speculative_draft params;
|
||||
llama_batch batch;
|
||||
|
||||
std::vector<common_sampler_ptr> smpls;
|
||||
|
||||
int32_t n_embd_dec = 0; // draft hidden size
|
||||
int32_t n_embd_enc = 0; // target_layer_ids_n * target_hidden_size
|
||||
int32_t n_embd_tgt = 0; // target model hidden size
|
||||
|
||||
const int32_t * target_layer_ids = nullptr; // model_dft's extract layer indices
|
||||
uint32_t target_layer_ids_n = 0;
|
||||
|
||||
// [per-seq] deferred boundary state
|
||||
std::vector<std::vector<float>> pending_g_last;
|
||||
std::vector<llama_pos> pending_pos_last;
|
||||
|
||||
// [per-seq] snapshot of the most recent process()'s encoder output
|
||||
std::vector<std::vector<float>> verify_g; // [n_seq][n_rows * n_embd_dec]
|
||||
std::vector<llama_pos> verify_pos_first; // [n_seq] — pos of verify_g[seq][0]
|
||||
std::vector<int32_t> verify_g_rows; // [n_seq] — number of rows
|
||||
|
||||
// scratch buffer for concatenated target features [n_tokens, n_embd_enc]
|
||||
std::vector<float> features_buf;
|
||||
std::vector<float> g_embd_buf;
|
||||
|
||||
common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq)
|
||||
, params(params.draft)
|
||||
{
|
||||
LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
|
||||
|
||||
auto * ctx_tgt = this->params.ctx_tgt;
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
GGML_ASSERT(ctx_tgt && ctx_dft && "EAGLE3 requires ctx_tgt and ctx_dft to be set");
|
||||
|
||||
const llama_model * model_dft = llama_get_model(ctx_dft);
|
||||
const llama_model * model_tgt = llama_get_model(ctx_tgt);
|
||||
|
||||
target_layer_ids = llama_model_target_layer_ids (model_dft);
|
||||
target_layer_ids_n = llama_model_target_layer_ids_n(model_dft);
|
||||
if (target_layer_ids_n != 3) {
|
||||
throw std::runtime_error("draft model is not eagle3 (expected 3 extract layers, got " +
|
||||
std::to_string(target_layer_ids_n) + ")");
|
||||
}
|
||||
|
||||
n_embd_tgt = llama_model_n_embd(model_tgt);
|
||||
n_embd_dec = llama_model_n_embd(model_dft);
|
||||
n_embd_enc = (int32_t) target_layer_ids_n * n_embd_tgt;
|
||||
|
||||
const int32_t n_b = (int32_t) llama_n_batch(ctx_dft);
|
||||
batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd_dec, /*n_seq_max=*/ 1);
|
||||
// llama_batch_init allocates only one of token/embd; eagle3 decoder needs both.
|
||||
// TODO: fix, how to call without malloc
|
||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_b);
|
||||
|
||||
smpls.resize(n_seq);
|
||||
for (auto & s : smpls) {
|
||||
common_params_sampling sparams;
|
||||
sparams.no_perf = false;
|
||||
sparams.top_k = 10;
|
||||
sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K };
|
||||
s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
|
||||
}
|
||||
|
||||
// turn on extraction of the target layers' input embeddings
|
||||
for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
|
||||
llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], true);
|
||||
}
|
||||
|
||||
// turn on extraction of the draft model's pre-norm hidden state
|
||||
// (used both for the encoder output g_embd and the decoder pre-norm output).
|
||||
llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);
|
||||
|
||||
pending_g_last.assign(n_seq, std::vector<float>(n_embd_dec, 0.0f));
|
||||
pending_pos_last.assign(n_seq, -1);
|
||||
|
||||
verify_g.assign(n_seq, std::vector<float>());
|
||||
verify_pos_first.assign(n_seq, -1);
|
||||
verify_g_rows.assign(n_seq, 0);
|
||||
}
|
||||
|
||||
void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
|
||||
// noop
|
||||
~common_speculative_impl_draft_eagle3() override {
|
||||
if (batch.token != nullptr) {
|
||||
free(batch.token);
|
||||
batch.token = nullptr;
|
||||
}
|
||||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
bool process(const llama_batch & /*batch*/) override {
|
||||
// TODO: implement
|
||||
void begin(llama_seq_id seq_id, const llama_tokens & prompt) override {
|
||||
const int32_t N = (int32_t) prompt.size();
|
||||
if (N <= 0) {
|
||||
return;
|
||||
}
|
||||
// expected state after prefill: ctx_dft has pos 0..N-2 (last position is deferred to
|
||||
// draft()'s seed step). Warn only if more than one position is missing.
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
|
||||
if (pos_max < N - 2) {
|
||||
LOG_WRN("%s: ctx_dft pos_max=%d < N-2=%d — process() did not run on every prefill ubatch. "
|
||||
"Drafts may degrade.\n",
|
||||
__func__, (int) pos_max, N - 2);
|
||||
}
|
||||
}
|
||||
|
||||
bool process(const llama_batch & batch_in) override {
|
||||
if (batch_in.n_tokens <= 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (batch_in.token == nullptr || batch_in.embd != nullptr) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const int32_t n_tokens = batch_in.n_tokens;
|
||||
|
||||
// i_batch_beg[seq] / i_batch_end[seq]: inclusive batch indices of this seq's
|
||||
// first/last token in batch_in. Assumes per-seq tokens are contiguous within
|
||||
// the ubatch (server's default ordering).
|
||||
std::vector<int32_t> i_batch_beg(n_seq, -1);
|
||||
std::vector<int32_t> i_batch_end(n_seq, -1);
|
||||
for (int k = 0; k < n_tokens; ++k) {
|
||||
GGML_ASSERT(batch_in.n_seq_id[k] == 1);
|
||||
const llama_seq_id seq_id = batch_in.seq_id[k][0];
|
||||
if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
|
||||
continue;
|
||||
}
|
||||
i_batch_end[seq_id] = k;
|
||||
if (i_batch_beg[seq_id] < 0) {
|
||||
i_batch_beg[seq_id] = k;
|
||||
}
|
||||
}
|
||||
|
||||
auto * ctx_tgt = this->params.ctx_tgt;
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
|
||||
// Interleave each extract_layer's hidden state into a contiguous buffer of
|
||||
// shape [n_tokens, target_layer_ids_n * n_embd_tgt]. Then run EAGLE3 encoder
|
||||
// to get one g_embd row per token.
|
||||
features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f);
|
||||
|
||||
for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
|
||||
const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k]);
|
||||
if (!layer) {
|
||||
GGML_ABORT("EAGLE3: target layer %d input not extracted.", target_layer_ids[k]);
|
||||
}
|
||||
for (int32_t i = 0; i < n_tokens; ++i) {
|
||||
float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) n_embd_tgt;
|
||||
const float * src = layer + (size_t) i * n_embd_tgt;
|
||||
std::memcpy(dst, src, (size_t) n_embd_tgt * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
g_embd_buf.resize((size_t) n_tokens * n_embd_dec);
|
||||
|
||||
// llama_encode() requires the full encoder batch to fit in n_ubatch.
|
||||
// Allow batch > ubatch: eagle3's per-token encoder can be chunked safely.
|
||||
const int32_t n_ubatch_dft = (int32_t) llama_n_ubatch(ctx_dft);
|
||||
for (int32_t i = 0; i < n_tokens; i += n_ubatch_dft) {
|
||||
const int32_t n_chunk = std::min(n_ubatch_dft, n_tokens - i);
|
||||
|
||||
llama_batch enc_batch = {
|
||||
/*.n_tokens =*/ n_chunk,
|
||||
/*.token =*/ nullptr,
|
||||
/*.embd =*/ features_buf.data() + (size_t) i * n_embd_enc,
|
||||
/*.pos =*/ nullptr,
|
||||
/*.n_seq_id =*/ nullptr,
|
||||
/*.seq_id =*/ nullptr,
|
||||
/*.logits =*/ nullptr,
|
||||
};
|
||||
const int32_t rc = llama_encode(ctx_dft, enc_batch);
|
||||
if (rc != 0) {
|
||||
LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n",
|
||||
__func__, rc, (int) n_chunk, (int) i);
|
||||
return false;
|
||||
}
|
||||
|
||||
// g_embd has shape [n_chunk, n_embd_dec] in ctx_dft's pre-norm embeddings buffer.
|
||||
const float * g_embd_chunk = llama_get_embeddings_nextn(ctx_dft);
|
||||
GGML_ASSERT(g_embd_chunk && "EAGLE3 encoder produced no output.");
|
||||
std::memcpy(g_embd_buf.data() + (size_t) i * n_embd_dec,
|
||||
g_embd_chunk,
|
||||
(size_t) n_chunk * n_embd_dec * sizeof(float));
|
||||
}
|
||||
|
||||
const float * g_embd = g_embd_buf.data();
|
||||
|
||||
const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
|
||||
|
||||
// EAGLE3 decoder input convention: at memory pos P the input pair is
|
||||
// (token[P+1], g_embd[P]). This shifts the token index "left by one" relative to g_embd.
|
||||
//
|
||||
// Per seq, in order:
|
||||
// (a) cross-ubatch bridge — when applicable, write the previously-deferred
|
||||
// pos using this ubatch's first token + pending_g_last.
|
||||
// (b) main write loop — for k in [beg, end-1], write (token[k+1], g_embd[k])
|
||||
// at pos[k]. The last training pos (k=end) is left unwritten = new
|
||||
// deferred boundary, completed by the next process() or draft() call.
|
||||
// (c) refresh deferred state — stash this ubatch's full g_embd into verify_g,
|
||||
// update pending_g_last / pending_pos_last to the last row.
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
const int32_t beg = i_batch_beg[seq_id];
|
||||
const int32_t end = i_batch_end[seq_id];
|
||||
if (beg < 0 || end < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// cross-ubatch bridge — complete the prior ubatch's deferred boundary.
|
||||
// Fires iff all three preconditions hold:
|
||||
// 1) pending_pos_last >= 0
|
||||
// 2) pending_pos_last + 1 == pos[beg]
|
||||
// 3) pending_pos_last > dft_pos_max // TODO: is this check needed?
|
||||
const llama_pos pending_pos = pending_pos_last[seq_id];
|
||||
if (pending_pos >= 0 && pending_pos + 1 == batch_in.pos[beg]) {
|
||||
const llama_pos dft_pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
|
||||
if (pending_pos > dft_pos_max) {
|
||||
common_batch_add(batch, batch_in.token[beg], pending_pos, { seq_id }, /*logits=*/ false);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
|
||||
pending_g_last[seq_id].data(), row_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t k = beg; k < end; ++k) {
|
||||
common_batch_add(batch, batch_in.token[k + 1], batch_in.pos[k], { seq_id }, /*logits=*/ false);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
|
||||
g_embd + (size_t) k * n_embd_dec, row_bytes);
|
||||
}
|
||||
|
||||
// refresh deferred state
|
||||
const int32_t n_rows = end - beg + 1;
|
||||
verify_pos_first[seq_id] = batch_in.pos[beg];
|
||||
pending_pos_last[seq_id] = batch_in.pos[end];
|
||||
verify_g_rows[seq_id] = n_rows;
|
||||
verify_g[seq_id].resize((size_t) n_rows * n_embd_dec, 0.0f);
|
||||
std::memcpy(verify_g[seq_id].data(), g_embd + (size_t) beg * n_embd_dec, row_bytes * n_rows);
|
||||
std::memcpy(pending_g_last[seq_id].data(), g_embd + (size_t) end * n_embd_dec, row_bytes);
|
||||
}
|
||||
|
||||
if (batch.n_tokens > 0) {
|
||||
const int32_t rc = llama_decode(ctx_dft, batch);
|
||||
if (rc != 0) {
|
||||
LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n",
|
||||
__func__, rc, (int) batch.n_tokens, (int) batch_in.pos[0]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void draft(common_speculative_draft_params_vec & /*dparams*/) override {
|
||||
// TODO: implement
|
||||
void draft(common_speculative_draft_params_vec & dparams) override {
|
||||
auto & ctx_dft = params.ctx_dft;
|
||||
|
||||
common_batch_clear(batch);
|
||||
|
||||
// keep track of which sequences are still drafting
|
||||
int n_drafting = 0;
|
||||
std::vector<bool> drafting(n_seq);
|
||||
|
||||
const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
|
||||
|
||||
// Complete the deferred boundary pair (dp.id_last, pending_g_last) at memory
|
||||
// pos pending_pos_last. dp.id_last is target's freshest sample (= corrected
|
||||
// token after verify, or first generated token after prefill), matching the
|
||||
// EAGLE3 input convention (token[P+1], g_embd[P]) at pos P.
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
auto & dp = dparams[seq_id];
|
||||
|
||||
if (!dp.drafting) {
|
||||
continue;
|
||||
}
|
||||
if (pending_pos_last[seq_id] < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
n_drafting++;
|
||||
drafting[seq_id] = true;
|
||||
common_sampler_reset(smpls[seq_id].get());
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_dft), seq_id, pending_pos_last[seq_id], -1);
|
||||
|
||||
common_batch_add(batch, dp.id_last, pending_pos_last[seq_id], { seq_id }, true);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
|
||||
pending_g_last[seq_id].data(),
|
||||
row_bytes);
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0) {
|
||||
LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
|
||||
return;
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
|
||||
while (n_drafting > 0) {
|
||||
int i_batch = 0;
|
||||
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
if (!drafting[seq_id]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto * smpl = smpls[seq_id].get();
|
||||
|
||||
common_sampler_sample(smpl, ctx_dft, i_batch, true);
|
||||
// pre-norm hidden state of this position becomes g_embd for the next step
|
||||
const float * prenorm = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
|
||||
++i_batch;
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(smpl, true);
|
||||
|
||||
for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
|
||||
LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||
seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p,
|
||||
common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
|
||||
}
|
||||
|
||||
const llama_token id = cur_p->data[0].id;
|
||||
|
||||
// only collect very high-confidence draft tokens
|
||||
// (configurable via --spec-draft-p-min, set to 0.0 to disable early-stop)
|
||||
if (cur_p->data[0].p < params.p_min) {
|
||||
drafting[seq_id] = false;
|
||||
n_drafting--;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
auto & dp = dparams.at(seq_id);
|
||||
auto & result = *dp.result;
|
||||
|
||||
result.push_back(id);
|
||||
|
||||
if (params.n_max <= (int) result.size()) {
|
||||
drafting[seq_id] = false;
|
||||
n_drafting--;
|
||||
continue;
|
||||
}
|
||||
|
||||
common_batch_add(batch, id, pending_pos_last[seq_id] + (i + 1), { seq_id }, true);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, prenorm, row_bytes);
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0) {
|
||||
LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
|
||||
break;
|
||||
}
|
||||
|
||||
++i;
|
||||
}
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
auto & dp = dparams[seq_id];
|
||||
if (!dp.drafting) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (dp.result->size() < (size_t) params.n_min) {
|
||||
dp.result->clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
|
||||
// noop
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override {
|
||||
if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int32_t n_rows = verify_g_rows[seq_id];
|
||||
if (n_rows <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int32_t i_g = std::min<int32_t>(n_accepted, n_rows - 1);
|
||||
pending_pos_last[seq_id] = verify_pos_first[seq_id] + i_g;
|
||||
std::memcpy(pending_g_last[seq_id].data(),
|
||||
verify_g[seq_id].data() + (size_t) i_g * n_embd_dec,
|
||||
(size_t) n_embd_dec * sizeof(float));
|
||||
}
|
||||
|
||||
bool need_embd() const override {
|
||||
@@ -1370,9 +1776,11 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types);
|
||||
|
||||
bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
|
||||
bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
|
||||
bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr;
|
||||
bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;
|
||||
|
||||
|
||||
|
||||
bool has_ngram_cache = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_CACHE));
|
||||
bool has_ngram_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE));
|
||||
bool has_ngram_map_k = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K));
|
||||
|
||||
@@ -130,6 +130,9 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"LlamaBidirectionalModel": "llama",
|
||||
"LlamaForCausalLM": "llama",
|
||||
"LlamaModel": "llama",
|
||||
"Eagle3DraftModel": "llama",
|
||||
"Eagle3Speculator": "llama",
|
||||
"LlamaForCausalLMEagle3": "llama",
|
||||
"LlavaForConditionalGeneration": "llama",
|
||||
"LlavaStableLMEpochForCausalLM": "stablelm",
|
||||
"MPTForCausalLM": "mpt",
|
||||
|
||||
@@ -94,6 +94,7 @@ class ModelBase:
|
||||
metadata: gguf.Metadata
|
||||
dir_model_card: Path
|
||||
remote_hf_model_id: str | None
|
||||
target_model_dir: Path | None
|
||||
|
||||
# subclasses should define this!
|
||||
model_arch: gguf.MODEL_ARCH
|
||||
@@ -119,6 +120,7 @@ class ModelBase:
|
||||
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
|
||||
disable_mistral_community_chat_template: bool = False,
|
||||
sentence_transformers_dense_modules: bool = False,
|
||||
target_model_dir: Path | None = None,
|
||||
fuse_gate_up_exps: bool = False,
|
||||
fp8_as_q8: bool = False):
|
||||
if type(self) is ModelBase or \
|
||||
@@ -139,6 +141,7 @@ class ModelBase:
|
||||
self.dry_run = dry_run
|
||||
self.remote_hf_model_id = remote_hf_model_id
|
||||
self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
|
||||
self.target_model_dir = target_model_dir
|
||||
self.fuse_gate_up_exps = fuse_gate_up_exps
|
||||
self._gate_exp_buffer: dict[int, Tensor] = {}
|
||||
self._up_exp_buffer: dict[int, Tensor] = {}
|
||||
@@ -2481,6 +2484,7 @@ class LazyTorchTensor(gguf.LazyBase):
|
||||
torch.float16: np.float16,
|
||||
torch.float32: np.float32,
|
||||
torch.uint8: np.uint8,
|
||||
torch.int64: np.int64,
|
||||
}
|
||||
|
||||
# only used when byteswapping data. Only correct size is needed
|
||||
|
||||
+130
-1
@@ -5,12 +5,13 @@ import math
|
||||
|
||||
from typing import Callable, Iterable, TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
from .base import ModelBase, TextModel, gguf
|
||||
from .base import ModelBase, TextModel, gguf, logger
|
||||
|
||||
|
||||
@ModelBase.register(
|
||||
@@ -21,6 +22,9 @@ from .base import ModelBase, TextModel, gguf
|
||||
"VLlama3ForCausalLM",
|
||||
"LlavaForConditionalGeneration",
|
||||
"VoxtralForConditionalGeneration",
|
||||
"LlamaForCausalLMEagle3",
|
||||
"Eagle3Speculator",
|
||||
"Eagle3DraftModel",
|
||||
"IQuestCoderForCausalLM",
|
||||
"LlamaModel")
|
||||
class LlamaModel(TextModel):
|
||||
@@ -39,7 +43,61 @@ class LlamaModel(TextModel):
|
||||
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
# Detect eagle3 draft checkpoint by hparams (some models don't use a distinct HF arch name)
|
||||
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
|
||||
self.is_eagle3 = True
|
||||
self.model_arch = gguf.MODEL_ARCH.EAGLE3
|
||||
logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture")
|
||||
# Re-initialize tensor_map with eagle3 architecture
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
# Update gguf_writer architecture
|
||||
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
|
||||
self.gguf_writer.add_architecture()
|
||||
if self.target_model_dir is None:
|
||||
raise ValueError(
|
||||
"EAGLE-3 model requires --target-model-dir to be specified. "
|
||||
"Please provide the path to the target model directory to read config.json"
|
||||
)
|
||||
# Read both eagle3 raw config and target model config
|
||||
with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f:
|
||||
eagle3_raw_config = json.load(f)
|
||||
with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f:
|
||||
target_config = json.load(f)
|
||||
|
||||
if "text_config" in target_config:
|
||||
target_config = {**target_config, **target_config["text_config"]}
|
||||
self.target_vocab_size = target_config["vocab_size"]
|
||||
|
||||
# target_layers: derived from target model layer count (low/mid/high)
|
||||
target_num_layers = target_config["num_hidden_layers"]
|
||||
target_layers = [2, target_num_layers // 2, target_num_layers - 3]
|
||||
logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)")
|
||||
self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", target_layers)
|
||||
|
||||
# target_hidden_size: prefer eagle3 config, fallback to target config
|
||||
if eagle3_raw_config.get("target_hidden_size") is not None:
|
||||
target_hidden_size = eagle3_raw_config["target_hidden_size"]
|
||||
src = "EAGLE-3 config"
|
||||
else:
|
||||
target_hidden_size = target_config["hidden_size"]
|
||||
src = "target model config"
|
||||
logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})")
|
||||
self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)
|
||||
|
||||
# norm_before_residual (RedHat-style eagle3 specific)
|
||||
norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
|
||||
logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}")
|
||||
self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)
|
||||
|
||||
def set_vocab(self):
|
||||
# eagle3: use tokenizer from target model if provided
|
||||
original_dir_model = None
|
||||
if getattr(self, 'is_eagle3', False):
|
||||
assert self.target_model_dir is not None
|
||||
logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}")
|
||||
original_dir_model = self.dir_model
|
||||
self.dir_model = self.target_model_dir
|
||||
|
||||
if self.origin_hf_arch == "GlmasrModel":
|
||||
return self._set_vocab_glmedge()
|
||||
|
||||
@@ -85,6 +143,10 @@ class LlamaModel(TextModel):
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
# eagle3: Restore original dir_model
|
||||
if original_dir_model is not None:
|
||||
self.dir_model = original_dir_model
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
@@ -129,7 +191,49 @@ class LlamaModel(TextModel):
|
||||
|
||||
return super().filter_tensors((name, gen))
|
||||
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
|
||||
tensors = super().index_tensors(remote_hf_model_id)
|
||||
|
||||
# Handle Eagle3Speculator nested config
|
||||
if "transformer_layer_config" in self.hparams:
|
||||
self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]}
|
||||
|
||||
# eagle3 detection
|
||||
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
|
||||
logger.info("EAGLE-3: renaming midlayer.* / layers.0.* to model.layers.0.*")
|
||||
new_tensors = {}
|
||||
for name, gen in tensors.items():
|
||||
if name.startswith("midlayer."):
|
||||
new_name = "model.layers.0." + name[len("midlayer."):]
|
||||
new_tensors[new_name] = gen
|
||||
elif name.startswith("layers.0."): # Eagle3Speculator format
|
||||
new_name = "model." + name
|
||||
new_tensors[new_name] = gen
|
||||
else:
|
||||
new_tensors[name] = gen
|
||||
return new_tensors
|
||||
|
||||
return tensors
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# eagle3: special tensors that bypass standard llama mapping
|
||||
if getattr(self, 'is_eagle3', False):
|
||||
if name == "fc.weight":
|
||||
yield (name, data_torch)
|
||||
return
|
||||
if name == "d2t":
|
||||
# store for manual int64 handling in prepare_tensors (avoid F32 conversion)
|
||||
if not hasattr(self, '_eagle3_int_tensors'):
|
||||
self._eagle3_int_tensors = {}
|
||||
self._eagle3_int_tensors[name] = data_torch
|
||||
return
|
||||
if name == "t2d":
|
||||
# not used at runtime, skip
|
||||
return
|
||||
if name.endswith(".hidden_norm.weight"):
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM_2, bid), data_torch)
|
||||
return
|
||||
|
||||
n_head = self.find_hparam(["n_heads", "num_attention_heads"])
|
||||
n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])
|
||||
|
||||
@@ -205,8 +309,33 @@ class LlamaModel(TextModel):
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
|
||||
|
||||
def prepare_tensors(self):
|
||||
# eagle3: collect d2t original dtype before parent converts tensors to F32
|
||||
eagle3_original_dtypes = {}
|
||||
if getattr(self, 'is_eagle3', False):
|
||||
for name, data_torch in self.get_tensors():
|
||||
if name == "d2t":
|
||||
eagle3_original_dtypes[name] = data_torch.dtype
|
||||
|
||||
super().prepare_tensors()
|
||||
|
||||
# eagle3: write d2t as absolute target token ids
|
||||
if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'):
|
||||
for name, data_torch in self._eagle3_int_tensors.items():
|
||||
old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype)
|
||||
data = data_torch.to(torch.int64).cpu().numpy()
|
||||
if name == "d2t":
|
||||
data = data.reshape(-1)
|
||||
data = data + np.arange(data.size, dtype=np.int64)
|
||||
if np.any((data < 0) | (data >= self.target_vocab_size)):
|
||||
raise ValueError(f"EAGLE-3 d2t target ids out of range for target vocab size {self.target_vocab_size}")
|
||||
if np.unique(data).size != data.size:
|
||||
raise ValueError("EAGLE-3 d2t contains duplicate target ids")
|
||||
data_qtype = gguf.GGMLQuantizationType.I64
|
||||
|
||||
shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
|
||||
logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
|
||||
@@ -153,6 +153,15 @@ def parse_args() -> argparse.Namespace:
|
||||
help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--target-model-dir", type=str, default=None,
|
||||
help=(
|
||||
"path to the target model directory; required when converting a standalone draft model "
|
||||
"(e.g. EAGLE3 / DFlash) that needs target-model metadata such as tokenizer, hidden size, and "
|
||||
"layer count to populate its GGUF."
|
||||
),
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
if not args.print_supported_models and args.model is None:
|
||||
parser.error("the following arguments are required: model")
|
||||
@@ -269,6 +278,7 @@ def main() -> None:
|
||||
small_first_shard=args.no_tensor_first_split,
|
||||
remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
|
||||
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
|
||||
target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None,
|
||||
fuse_gate_up_exps=args.fuse_gate_up_exps,
|
||||
fp8_as_q8=args.fp8_as_q8,
|
||||
)
|
||||
|
||||
+2
-2
@@ -4,8 +4,8 @@ project("ggml" C CXX ASM)
|
||||
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 14)
|
||||
set(GGML_VERSION_PATCH 0)
|
||||
set(GGML_VERSION_MINOR 15)
|
||||
set(GGML_VERSION_PATCH 1)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||
|
||||
@@ -1,16 +1,18 @@
|
||||
#include "concat.cuh"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// contiguous kernels
|
||||
template <int dim>
|
||||
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont(const float * x,
|
||||
const float * y,
|
||||
float * dst,
|
||||
int64_t ne00,
|
||||
int64_t ne01,
|
||||
int64_t ne02,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2) {
|
||||
template <typename T, int dim>
|
||||
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_cont(const T * x,
|
||||
const T * y,
|
||||
T * dst,
|
||||
int64_t ne00,
|
||||
int64_t ne01,
|
||||
int64_t ne02,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2) {
|
||||
static_assert(dim >= 0 && dim <= 2, "dim must be in [0, 2]");
|
||||
|
||||
const int64_t n = ne0 * ne1 * ne2;
|
||||
@@ -50,37 +52,37 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32_cuda(const float * x,
|
||||
const float * y,
|
||||
float * dst,
|
||||
int64_t ne00,
|
||||
int64_t ne01,
|
||||
int64_t ne02,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int dim,
|
||||
cudaStream_t stream) {
|
||||
template <typename T>
|
||||
static void concat_cont_cuda(const T * x,
|
||||
const T * y,
|
||||
T * dst,
|
||||
int64_t ne00,
|
||||
int64_t ne01,
|
||||
int64_t ne02,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int dim,
|
||||
cudaStream_t stream) {
|
||||
const int64_t n = ne0 * ne1 * ne2;
|
||||
const int num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
|
||||
|
||||
if (dim == 0) {
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream);
|
||||
ggml_cuda_kernel_launch(concat_f32_cont<0>, launch_params,x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
ggml_cuda_kernel_launch(concat_cont<T, 0>, launch_params, x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
return;
|
||||
}
|
||||
if (dim == 1) {
|
||||
concat_f32_cont<1>
|
||||
<<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
concat_cont<T, 1><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
return;
|
||||
}
|
||||
concat_f32_cont<2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
concat_cont<T, 2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
}
|
||||
|
||||
// non-contiguous kernel (slow)
|
||||
template <int dim>
|
||||
template <typename T, int dim>
|
||||
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
|
||||
concat_f32_non_cont(
|
||||
concat_non_cont(
|
||||
const char * src0,
|
||||
const char * src1,
|
||||
char * dst,
|
||||
@@ -107,61 +109,49 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
|
||||
uint64_t nb0,
|
||||
uint64_t nb1,
|
||||
uint64_t nb2,
|
||||
uint64_t nb3){
|
||||
uint64_t nb3) {
|
||||
static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");
|
||||
|
||||
const int64_t i3 = blockIdx.z;
|
||||
const int64_t i2 = blockIdx.y;
|
||||
const int64_t i1 = blockIdx.x;
|
||||
|
||||
const float * x;
|
||||
const T * x;
|
||||
|
||||
for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
|
||||
x = (const T *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
} else {
|
||||
if constexpr (dim == 0) {
|
||||
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
|
||||
x = (const T *)(src1 + i3*nb13 + i2*nb12 + i1*nb11 + (i0 - ne00)*nb10);
|
||||
} else if constexpr (dim == 1) {
|
||||
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
|
||||
x = (const T *)(src1 + i3*nb13 + i2*nb12 + (i1 - ne01)*nb11 + i0*nb10);
|
||||
} else if constexpr (dim == 2) {
|
||||
x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
|
||||
x = (const T *)(src1 + i3*nb13 + (i2 - ne02)*nb12 + i1*nb11 + i0*nb10);
|
||||
} else if constexpr (dim == 3) {
|
||||
x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
|
||||
x = (const T *)(src1 + (i3 - ne03)*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
|
||||
}
|
||||
}
|
||||
|
||||
float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
T * y = (T *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
template <typename T>
|
||||
static void concat_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, int dim, cudaStream_t stream) {
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
|
||||
float * dst_d = (float *)dst->data;
|
||||
const T * src0_d = (const T *) src0->data;
|
||||
const T * src1_d = (const T *) src1->data;
|
||||
T * dst_d = (T *) dst->data;
|
||||
|
||||
if (dim != 3) {
|
||||
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_f32_cuda(
|
||||
src0_d + i3 * (src0->nb[3] / 4),
|
||||
src1_d + i3 * (src1->nb[3] / 4),
|
||||
dst_d + i3 * ( dst->nb[3] / 4),
|
||||
for (int64_t i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_cont_cuda(
|
||||
src0_d + i3*(src0->nb[3] / sizeof(T)),
|
||||
src1_d + i3*(src1->nb[3] / sizeof(T)),
|
||||
dst_d + i3*( dst->nb[3] / sizeof(T)),
|
||||
src0->ne[0], src0->ne[1], src0->ne[2],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
|
||||
}
|
||||
@@ -169,13 +159,13 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const size_t size0 = ggml_nbytes(src0);
|
||||
const size_t size1 = ggml_nbytes(src1);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync((char *) dst->data, src0->data, size0, cudaMemcpyDeviceToDevice, stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync((char *) dst->data + size0, src1->data, size1, cudaMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
} else {
|
||||
dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
|
||||
auto launch_kernel = [&](auto dim) {
|
||||
concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
|
||||
concat_non_cont<T, dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
|
||||
(const char *) src0->data, (const char *) src1->data, (char *) dst->data,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
@@ -203,3 +193,35 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
GGML_ASSERT(dst->type == src0->type);
|
||||
GGML_ASSERT(!ggml_is_quantized(src0->type));
|
||||
GGML_ASSERT(ggml_blck_size(src0->type) == 1);
|
||||
|
||||
switch (ggml_type_size(src0->type)) {
|
||||
case 1:
|
||||
concat_cuda<uint8_t>(src0, src1, dst, dim, stream);
|
||||
break;
|
||||
case 2:
|
||||
concat_cuda<uint16_t>(src0, src1, dst, dim, stream);
|
||||
break;
|
||||
case 4:
|
||||
concat_cuda<uint32_t>(src0, src1, dst, dim, stream);
|
||||
break;
|
||||
case 8:
|
||||
concat_cuda<uint64_t>(src0, src1, dst, dim, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported type size: %zu", ggml_type_size(src0->type));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5345,7 +5345,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_CONCAT:
|
||||
{
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
|
||||
ggml_type src1_type = op->src[1]->type;
|
||||
return src0_type == src1_type &&
|
||||
src0_type == op->type &&
|
||||
!ggml_is_quantized(src0_type) &&
|
||||
ggml_blck_size(src0_type) == 1 &&
|
||||
(ggml_type_size(src0_type) == 1 ||
|
||||
ggml_type_size(src0_type) == 2 ||
|
||||
ggml_type_size(src0_type) == 4 ||
|
||||
ggml_type_size(src0_type) == 8);
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
{
|
||||
|
||||
@@ -1120,8 +1120,17 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_CONCAT:
|
||||
return true;
|
||||
case GGML_OP_CONCAT:
|
||||
{
|
||||
// kernel_concat copies one float-sized value per element.
|
||||
// Other scalar types need a type-generic copy kernel first.
|
||||
const enum ggml_type src0_type = op->src[0]->type;
|
||||
const enum ggml_type src1_type = op->src[1]->type;
|
||||
return src0_type == src1_type &&
|
||||
src0_type == op->type &&
|
||||
(src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_I32);
|
||||
}
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
|
||||
@@ -142,6 +142,10 @@ set(GGML_OPENCL_KERNELS
|
||||
gemm_noshuffle_q4_0_f32
|
||||
gemv_noshuffle_q4_1_f32
|
||||
gemm_noshuffle_q4_1_f32
|
||||
gemv_noshuffle_q5_0_f32
|
||||
gemm_noshuffle_q5_0_f32
|
||||
gemv_noshuffle_q5_1_f32
|
||||
gemm_noshuffle_q5_1_f32
|
||||
gemv_noshuffle_iq4_nl_f32
|
||||
gemm_noshuffle_iq4_nl_f32
|
||||
gemv_noshuffle_q8_0_f32
|
||||
|
||||
@@ -593,6 +593,10 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_restore_block_q4_0_noshuffle;
|
||||
cl_kernel kernel_convert_block_q4_1_noshuffle;
|
||||
cl_kernel kernel_restore_block_q4_1_noshuffle;
|
||||
cl_kernel kernel_convert_block_q5_0_noshuffle;
|
||||
cl_kernel kernel_restore_block_q5_0_noshuffle;
|
||||
cl_kernel kernel_convert_block_q5_1_noshuffle;
|
||||
cl_kernel kernel_restore_block_q5_1_noshuffle;
|
||||
cl_kernel kernel_convert_block_q4_K_noshuffle;
|
||||
cl_kernel kernel_restore_block_q4_K_noshuffle;
|
||||
cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K;
|
||||
@@ -829,6 +833,10 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_gemm_noshuffle_q6_K_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_q5_k_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q5_k_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_q5_0_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q5_0_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_q5_1_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q5_1_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_iq4_nl_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_iq4_nl_f32;
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
@@ -1152,6 +1160,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1", &err), err));
|
||||
@@ -3065,6 +3077,80 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_noshuffle_q5_0_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_noshuffle_q5_0_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_noshuffle_q5_0_f32.cl");
|
||||
#endif
|
||||
cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q5_0_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q5_0_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_noshuffle_q5_0_f32
|
||||
{
|
||||
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable ";
|
||||
if (backend_ctx->has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemv_noshuffle_q5_0_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemv_noshuffle_q5_0_f32.cl");
|
||||
#endif
|
||||
cl_program prog = build_program_from_source(
|
||||
backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q5_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q5_0_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_noshuffle_q5_1_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_noshuffle_q5_1_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_noshuffle_q5_1_f32.cl");
|
||||
#endif
|
||||
cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q5_1_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q5_1_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_noshuffle_q5_1_f32
|
||||
{
|
||||
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable ";
|
||||
if (backend_ctx->has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemv_noshuffle_q5_1_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemv_noshuffle_q5_1_f32.cl");
|
||||
#endif
|
||||
cl_program prog = build_program_from_source(
|
||||
backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q5_1_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q5_1_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_noshuffle_iq4_nl_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -6107,15 +6193,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0;
|
||||
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &n_blk));
|
||||
|
||||
size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64) * 64, 1, 1};
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
@@ -6124,7 +6211,39 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
tensor->extra = extra;
|
||||
|
||||
int M = tensor->ne[1];
|
||||
int K = tensor->ne[0];
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
|
||||
// Transpose qs as ushort
|
||||
transpose_2d_as_16b(backend_ctx, extra->qs, extra->qs, size_qs, K/4, M);
|
||||
// Transpose qh as uchar
|
||||
transpose_2d_as_8b(backend_ctx, extra->qh, extra->qh, size_qh, K/8, M);
|
||||
// Transpose d as ushort
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
|
||||
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0;
|
||||
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &n_blk));
|
||||
|
||||
size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64) * 64, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
tensor->extra = extra;
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q5_1) {
|
||||
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
||||
@@ -6225,6 +6344,42 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_1_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->m));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
tensor->extra = extra;
|
||||
|
||||
int M = tensor->ne[1];
|
||||
int K = tensor->ne[0];
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
|
||||
// Transpose qs as ushort
|
||||
transpose_2d_as_16b(backend_ctx, extra->qs, extra->qs, size_qs, K/4, M);
|
||||
// Transpose qh as uchar
|
||||
transpose_2d_as_8b(backend_ctx, extra->qh, extra->qh, size_qh, K/8, M);
|
||||
// Transpose d as ushort
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
|
||||
// Transpose m as ushort
|
||||
transpose_2d_as_16b(backend_ctx, extra->m, extra->m, size_m, K/32, M);
|
||||
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_1;
|
||||
cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type);
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
@@ -7299,6 +7454,48 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
ggml_cl_buffer buf_trans_qs;
|
||||
ggml_cl_buffer buf_trans_qh;
|
||||
ggml_cl_buffer buf_trans_d;
|
||||
ggml_cl_buffer buf_unpacked;
|
||||
|
||||
cl_int M = tensor->ne[1];
|
||||
cl_int K = tensor->ne[0];
|
||||
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
|
||||
size_t size_qs = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
|
||||
size_t size_qh = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(int32_t);
|
||||
size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
|
||||
|
||||
buf_trans_qs.allocate(backend_ctx->context, size_qs);
|
||||
buf_trans_qh.allocate(backend_ctx->context, size_qh);
|
||||
buf_trans_d.allocate(backend_ctx->context, size_d);
|
||||
buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
|
||||
|
||||
transpose_2d_as_16b(backend_ctx, extra->qs, buf_trans_qs.buffer, size_qs, M, K/4);
|
||||
transpose_2d_as_8b(backend_ctx, extra->qh, buf_trans_qh.buffer, size_qh, M, K/8);
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/32);
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q5_0_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_qs.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_qh.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_d.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_unpacked.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_0F));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_F0));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
cl_int err;
|
||||
@@ -7362,6 +7559,54 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
ggml_cl_buffer buf_trans_qs;
|
||||
ggml_cl_buffer buf_trans_qh;
|
||||
ggml_cl_buffer buf_trans_d;
|
||||
ggml_cl_buffer buf_trans_m;
|
||||
ggml_cl_buffer buf_unpacked;
|
||||
|
||||
cl_int M = tensor->ne[1];
|
||||
cl_int K = tensor->ne[0];
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
|
||||
size_t size_qs = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
|
||||
size_t size_qh = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(int32_t);
|
||||
size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
|
||||
size_t size_m = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
|
||||
|
||||
buf_trans_qs.allocate(backend_ctx->context, size_qs);
|
||||
buf_trans_qh.allocate(backend_ctx->context, size_qh);
|
||||
buf_trans_d.allocate(backend_ctx->context, size_d);
|
||||
buf_trans_m.allocate(backend_ctx->context, size_m);
|
||||
buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
|
||||
|
||||
// Transpose back: from col-major to row-major
|
||||
transpose_2d_as_16b(backend_ctx, extra->qs, buf_trans_qs.buffer, size_qs, M, K/4);
|
||||
transpose_2d_as_8b(backend_ctx, extra->qh, buf_trans_qh.buffer, size_qh, M, K/8);
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/32);
|
||||
transpose_2d_as_16b(backend_ctx, extra->m, buf_trans_m.buffer, size_m, M, K/32);
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q5_1_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_qs.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_qh.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_d.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_trans_m.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_unpacked.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
@@ -12205,6 +12450,368 @@ static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_q5_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)src0->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
|
||||
const int ne1 = dst->ne[1];
|
||||
|
||||
GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
|
||||
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_kernel kernel;
|
||||
|
||||
cl_int err;
|
||||
cl_image_format img_fmt;
|
||||
cl_image_desc img_desc;
|
||||
cl_buffer_region region;
|
||||
|
||||
int M = ne01;
|
||||
int N = ne1;
|
||||
int K = ne00;
|
||||
|
||||
if (ne1 == 1) {
|
||||
cl_mem qs_img = nullptr;
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
|
||||
// image for qs
|
||||
img_fmt = { CL_R, CL_UNSIGNED_INT32 };
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 2 / 4;
|
||||
img_desc.buffer = extra0_q5_0->qs;
|
||||
CL_CHECK((qs_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
kernel = backend_ctx->kernel_gemv_noshuffle_q5_0_f32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &qs_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_0->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne01));
|
||||
|
||||
size_t local_work_size[3] = {64, 4, 1};
|
||||
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(qs_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
} else {
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_sub_buf_trans = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
cl_mem b_img_trans = nullptr;
|
||||
cl_mem d_sub_buf = nullptr;
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// pad N to multiple of 8
|
||||
int extra_elements = N % 8;
|
||||
int padding = 0;
|
||||
if (extra_elements > 0){
|
||||
padding = 8 - extra_elements;
|
||||
}
|
||||
|
||||
// subbuffer for transposed activations
|
||||
region.origin = 0;
|
||||
region.size = K * (N + padding) * sizeof(float)/2;
|
||||
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
||||
CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for transposed activations
|
||||
img_fmt = {CL_RGBA, CL_HALF_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * (N + padding) / 4;
|
||||
img_desc.buffer = b_sub_buf_trans;
|
||||
CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// subbuffer for output
|
||||
region.origin = extrad->offset;
|
||||
region.size = M * N * sizeof(float);
|
||||
CL_CHECK((d_sub_buf = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// transpose activations
|
||||
int height_B = N/4;
|
||||
if (height_B == 0) {
|
||||
height_B = 1;
|
||||
}
|
||||
int width_B = K/4;
|
||||
int padded_height_B = (N + padding)/4;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_32_16;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
|
||||
|
||||
size_t local_work_size_t[2] = { 1, 16 };
|
||||
size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
|
||||
|
||||
// gemm
|
||||
kernel = backend_ctx->kernel_gemm_noshuffle_q5_0_f32;
|
||||
int padded_N = N + padding;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_0->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_0->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_sub_buf));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &padded_N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne1));
|
||||
|
||||
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
|
||||
size_t local_work_size[3] = {1, 128, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_img_trans));
|
||||
CL_CHECK(clReleaseMemObject(d_sub_buf));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(backend);
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(dst);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_q5_1_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
ggml_tensor_extra_cl_q5_1 * extra0_q5_1 = (ggml_tensor_extra_cl_q5_1 *)src0->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
|
||||
const int ne1 = dst->ne[1];
|
||||
|
||||
GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
|
||||
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_kernel kernel;
|
||||
|
||||
cl_int err;
|
||||
cl_image_format img_fmt;
|
||||
cl_image_desc img_desc;
|
||||
cl_buffer_region region;
|
||||
|
||||
int M = ne01;
|
||||
int N = ne1;
|
||||
int K = ne00;
|
||||
|
||||
if (ne1 == 1) {
|
||||
cl_mem qs_img = nullptr;
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
|
||||
// image for qs
|
||||
img_fmt = { CL_R, CL_UNSIGNED_INT32 };
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 2 / 4;
|
||||
img_desc.buffer = extra0_q5_1->qs;
|
||||
CL_CHECK((qs_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
kernel = backend_ctx->kernel_gemv_noshuffle_q5_1_f32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &qs_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_1->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_1->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_1->m));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne01));
|
||||
|
||||
size_t local_work_size[3] = {64, 4, 1};
|
||||
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(qs_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
} else {
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_sub_buf_trans = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
cl_mem b_img_trans = nullptr;
|
||||
cl_mem d_sub_buf = nullptr;
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// pad N to multiple of 8
|
||||
int extra_elements = N % 8;
|
||||
int padding = 0;
|
||||
if (extra_elements > 0){
|
||||
padding = 8 - extra_elements;
|
||||
}
|
||||
|
||||
// subbuffer for transposed activations
|
||||
region.origin = 0;
|
||||
region.size = K * (N + padding) * sizeof(float)/2;
|
||||
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
||||
CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for transposed activations
|
||||
img_fmt = {CL_RGBA, CL_HALF_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * (N + padding) / 4;
|
||||
img_desc.buffer = b_sub_buf_trans;
|
||||
CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// subbuffer for output
|
||||
region.origin = extrad->offset;
|
||||
region.size = M * N * sizeof(float);
|
||||
CL_CHECK((d_sub_buf = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// transpose activations
|
||||
int height_B = N/4;
|
||||
if (height_B == 0) {
|
||||
height_B = 1;
|
||||
}
|
||||
int width_B = K/4;
|
||||
int padded_height_B = (N + padding)/4;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_32_16;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
|
||||
|
||||
size_t local_work_size_t[2] = { 1, 16 };
|
||||
size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
|
||||
|
||||
// gemm
|
||||
kernel = backend_ctx->kernel_gemm_noshuffle_q5_1_f32;
|
||||
int padded_N = N + padding;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_1->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_1->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_1->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_1->m));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_sub_buf));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &padded_N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int), &ne1));
|
||||
|
||||
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
|
||||
size_t local_work_size[3] = {1, 128, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_img_trans));
|
||||
CL_CHECK(clReleaseMemObject(d_sub_buf));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(backend);
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(dst);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_ASSERT(src0);
|
||||
@@ -13243,6 +13850,18 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
return;
|
||||
}
|
||||
|
||||
// q5_0 x fp32
|
||||
if (src0t == GGML_TYPE_Q5_0 && src1t == GGML_TYPE_F32) {
|
||||
ggml_cl_mul_mat_q5_0_f32_adreno(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// q5_1 x fp32
|
||||
if (src0t == GGML_TYPE_Q5_1 && src1t == GGML_TYPE_F32) {
|
||||
ggml_cl_mul_mat_q5_1_f32_adreno(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// iq4_nl x fp32
|
||||
if (src0t == GGML_TYPE_IQ4_NL && src1t == GGML_TYPE_F32) {
|
||||
ggml_cl_mul_mat_iq4_nl_f32_adreno(backend, src0, src1, dst);
|
||||
|
||||
@@ -584,6 +584,60 @@ kernel void kernel_restore_block_q5_0(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_0_noshuffle(
|
||||
global struct block_q5_0 * src0,
|
||||
global uchar * dst_q,
|
||||
global uint * dst_qh,
|
||||
global half * dst_d
|
||||
) {
|
||||
global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0);
|
||||
global uchar * q = (global uchar *) dst_q + QK5_0/2*get_global_id(0);
|
||||
global uint * qh = (global uint *) dst_qh + get_global_id(0);
|
||||
global half * d = (global half *) dst_d + get_global_id(0);
|
||||
|
||||
*d = b->d;
|
||||
*qh = *((global uint *)(b->qh));
|
||||
|
||||
for (int i = 0; i < QK5_0/4; ++i) {
|
||||
uchar x0 = b->qs[2*i + 0];
|
||||
uchar x1 = b->qs[2*i + 1];
|
||||
|
||||
q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
||||
q[i + QK5_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
if (get_global_id(0) == 65536*4096) {
|
||||
printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q5_0_noshuffle(
|
||||
global uchar * src_q,
|
||||
global uint * src_qh,
|
||||
global half * src_d,
|
||||
global struct block_q5_0 * dst,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0);
|
||||
global uchar * q = (global uchar *) src_q + QK5_0/2*get_global_id(0);
|
||||
global uint * qh = (global uint *) src_qh + get_global_id(0);
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
|
||||
b->d = *d;
|
||||
*((global uint *)(b->qh)) = *qh;
|
||||
|
||||
for (int i = 0; i < QK5_0/4; ++i) {
|
||||
uchar x0 = q[i + 0 ];
|
||||
uchar x1 = q[i + QK5_0/4];
|
||||
|
||||
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
|
||||
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_0_trans4_ns(
|
||||
__global struct block_q5_0 * src0,
|
||||
__global uint * dst_qs,
|
||||
@@ -736,6 +790,66 @@ kernel void kernel_restore_block_q5_1(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_1_noshuffle(
|
||||
global struct block_q5_1 * src0,
|
||||
global uchar * dst_q,
|
||||
global uint * dst_qh,
|
||||
global half * dst_d,
|
||||
global half * dst_m
|
||||
) {
|
||||
global struct block_q5_1 * b = (global struct block_q5_1 *) src0 + get_global_id(0);
|
||||
global uchar * q = (global uchar *) dst_q + QK5_1/2*get_global_id(0);
|
||||
global uint * qh = (global uint *) dst_qh + get_global_id(0);
|
||||
global half * d = (global half *) dst_d + get_global_id(0);
|
||||
global half * m = (global half *) dst_m + get_global_id(0);
|
||||
|
||||
*d = b->d;
|
||||
*m = b->m;
|
||||
*qh = *((global uint *)(b->qh));
|
||||
|
||||
for (int i = 0; i < QK5_1/4; ++i) {
|
||||
uchar x0 = b->qs[2*i + 0];
|
||||
uchar x1 = b->qs[2*i + 1];
|
||||
|
||||
q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
||||
q[i + QK5_1/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
if (get_global_id(0) == 65536*4096) {
|
||||
printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q5_1_noshuffle(
|
||||
global uchar * src_q,
|
||||
global uint * src_qh,
|
||||
global half * src_d,
|
||||
global half * src_m,
|
||||
global struct block_q5_1 * dst,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q5_1 * b = (global struct block_q5_1 *) dst + get_global_id(0);
|
||||
global uchar * q = (global uchar *) src_q + QK5_1/2*get_global_id(0);
|
||||
global uint * qh = (global uint *) src_qh + get_global_id(0);
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
global half * m = (global half *) src_m + get_global_id(0);
|
||||
|
||||
b->d = *d;
|
||||
b->m = *m;
|
||||
*((global uint *)(b->qh)) = *qh;
|
||||
|
||||
for (int i = 0; i < QK5_1/4; ++i) {
|
||||
uchar x0 = q[i + 0 ];
|
||||
uchar x1 = q[i + QK5_1/4];
|
||||
|
||||
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
|
||||
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_1_trans4_ns(
|
||||
__global struct block_q5_1 * src0,
|
||||
__global uint * dst_qs,
|
||||
|
||||
@@ -0,0 +1,131 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_128
|
||||
#endif
|
||||
|
||||
kernel void kernel_gemm_noshuffle_q5_0_f32(
|
||||
global const ushort * src0_qs, // quantized A
|
||||
global const uchar * src0_qh, // 5th bits
|
||||
global const half * src0_d, // A scales
|
||||
__read_only image1d_buffer_t src1, // B (1d image)
|
||||
global float * dst, // C
|
||||
int m, // M
|
||||
int n, // N with padding
|
||||
int k, // K
|
||||
int n_no_padding // N without padding
|
||||
) {
|
||||
|
||||
int n_4 = n >> 2;
|
||||
|
||||
int gy = get_global_id(0);
|
||||
int gx = get_global_id(1);
|
||||
int gx_2 = gx << 2;
|
||||
|
||||
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
|
||||
half8 B;
|
||||
half4 dequantized_weights;
|
||||
|
||||
global const ushort * weight_ptr = src0_qs + gx_2;
|
||||
global const uchar * qh_ptr = src0_qh + gx_2;
|
||||
global const half * scale_ptr = src0_d + gx_2;
|
||||
|
||||
for (int i = 0; i < k; i += 4) {
|
||||
|
||||
B.s0123 = read_imageh(src1, gy*2 + i*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + i*n_4 + 1);
|
||||
|
||||
ushort4 bits4 = vload4(0, weight_ptr + (i >> 2)*m);
|
||||
uchar4 bits1 = vload4(0, qh_ptr + (i >> 3)*m);
|
||||
uchar4 qh = bits1 >> (uchar4)(i & 4);
|
||||
|
||||
half4 scale = vload4(0, scale_ptr + (i >> 5)*m);
|
||||
|
||||
// j=0
|
||||
dequantized_weights.s0 = (convert_half((bits4.s0 & 0x000F) | ((qh.s0 & 0x01) << 4)) - 16.0h) * scale.s0;
|
||||
dequantized_weights.s1 = (convert_half((bits4.s1 & 0x000F) | ((qh.s1 & 0x01) << 4)) - 16.0h) * scale.s1;
|
||||
dequantized_weights.s2 = (convert_half((bits4.s2 & 0x000F) | ((qh.s2 & 0x01) << 4)) - 16.0h) * scale.s2;
|
||||
dequantized_weights.s3 = (convert_half((bits4.s3 & 0x000F) | ((qh.s3 & 0x01) << 4)) - 16.0h) * scale.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=1
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i+1)*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i+1)*n_4 + 1);
|
||||
dequantized_weights.s0 = (convert_half(((bits4.s0 & 0x00F0) >> 4) | ((qh.s0 & 0x02) << 3)) - 16.0h) * scale.s0;
|
||||
dequantized_weights.s1 = (convert_half(((bits4.s1 & 0x00F0) >> 4) | ((qh.s1 & 0x02) << 3)) - 16.0h) * scale.s1;
|
||||
dequantized_weights.s2 = (convert_half(((bits4.s2 & 0x00F0) >> 4) | ((qh.s2 & 0x02) << 3)) - 16.0h) * scale.s2;
|
||||
dequantized_weights.s3 = (convert_half(((bits4.s3 & 0x00F0) >> 4) | ((qh.s3 & 0x02) << 3)) - 16.0h) * scale.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=2
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i+2)*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i+2)*n_4 + 1);
|
||||
dequantized_weights.s0 = (convert_half(((bits4.s0 & 0x0F00) >> 8) | ((qh.s0 & 0x04) << 2)) - 16.0h) * scale.s0;
|
||||
dequantized_weights.s1 = (convert_half(((bits4.s1 & 0x0F00) >> 8) | ((qh.s1 & 0x04) << 2)) - 16.0h) * scale.s1;
|
||||
dequantized_weights.s2 = (convert_half(((bits4.s2 & 0x0F00) >> 8) | ((qh.s2 & 0x04) << 2)) - 16.0h) * scale.s2;
|
||||
dequantized_weights.s3 = (convert_half(((bits4.s3 & 0x0F00) >> 8) | ((qh.s3 & 0x04) << 2)) - 16.0h) * scale.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=3
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i+3)*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i+3)*n_4 + 1);
|
||||
dequantized_weights.s0 = (convert_half(((bits4.s0 & 0xF000) >> 12) | ((qh.s0 & 0x08) << 1)) - 16.0h) * scale.s0;
|
||||
dequantized_weights.s1 = (convert_half(((bits4.s1 & 0xF000) >> 12) | ((qh.s1 & 0x08) << 1)) - 16.0h) * scale.s1;
|
||||
dequantized_weights.s2 = (convert_half(((bits4.s2 & 0xF000) >> 12) | ((qh.s2 & 0x08) << 1)) - 16.0h) * scale.s2;
|
||||
dequantized_weights.s3 = (convert_half(((bits4.s3 & 0xF000) >> 12) | ((qh.s3 & 0x08) << 1)) - 16.0h) * scale.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
}
|
||||
|
||||
int idx = (gy<<3)*m + (gx<<2);
|
||||
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,134 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_128
|
||||
#endif
|
||||
|
||||
kernel void kernel_gemm_noshuffle_q5_1_f32(
|
||||
global const ushort * src0_qs, // quantized A
|
||||
global const uchar * src0_qh, // 5th bits
|
||||
global const half * src0_d, // A scales
|
||||
global const half * src0_m, // A mins
|
||||
__read_only image1d_buffer_t src1, // B (1d image)
|
||||
global float * dst, // C
|
||||
int m, // M
|
||||
int n, // N with padding
|
||||
int k, // K
|
||||
int n_no_padding // N without padding
|
||||
) {
|
||||
|
||||
int n_4 = n >> 2;
|
||||
|
||||
int gy = get_global_id(0);
|
||||
int gx = get_global_id(1);
|
||||
int gx_2 = gx << 2;
|
||||
|
||||
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
|
||||
half8 B;
|
||||
half4 dequantized_weights;
|
||||
|
||||
global const ushort * weight_ptr = src0_qs + gx_2;
|
||||
global const uchar * qh_ptr = src0_qh + gx_2;
|
||||
global const half * scale_ptr = src0_d + gx_2;
|
||||
global const half * min_ptr = src0_m + gx_2;
|
||||
|
||||
for (int i = 0; i < k; i += 4) {
|
||||
|
||||
B.s0123 = read_imageh(src1, gy*2 + i*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + i*n_4 + 1);
|
||||
|
||||
ushort4 bits4 = vload4(0, weight_ptr + (i >> 2)*m);
|
||||
uchar4 bits1 = vload4(0, qh_ptr + (i >> 3)*m);
|
||||
uchar4 qh = bits1 >> (uchar4)(i & 4);
|
||||
|
||||
half4 scale = vload4(0, scale_ptr + (i >> 5)*m);
|
||||
half4 minv = vload4(0, min_ptr + (i >> 5)*m);
|
||||
|
||||
// j=0
|
||||
dequantized_weights.s0 = convert_half((bits4.s0 & 0x000F) | ((qh.s0 & 0x01) << 4)) * scale.s0 + minv.s0;
|
||||
dequantized_weights.s1 = convert_half((bits4.s1 & 0x000F) | ((qh.s1 & 0x01) << 4)) * scale.s1 + minv.s1;
|
||||
dequantized_weights.s2 = convert_half((bits4.s2 & 0x000F) | ((qh.s2 & 0x01) << 4)) * scale.s2 + minv.s2;
|
||||
dequantized_weights.s3 = convert_half((bits4.s3 & 0x000F) | ((qh.s3 & 0x01) << 4)) * scale.s3 + minv.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=1
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i+1)*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i+1)*n_4 + 1);
|
||||
dequantized_weights.s0 = convert_half(((bits4.s0 & 0x00F0) >> 4) | ((qh.s0 & 0x02) << 3)) * scale.s0 + minv.s0;
|
||||
dequantized_weights.s1 = convert_half(((bits4.s1 & 0x00F0) >> 4) | ((qh.s1 & 0x02) << 3)) * scale.s1 + minv.s1;
|
||||
dequantized_weights.s2 = convert_half(((bits4.s2 & 0x00F0) >> 4) | ((qh.s2 & 0x02) << 3)) * scale.s2 + minv.s2;
|
||||
dequantized_weights.s3 = convert_half(((bits4.s3 & 0x00F0) >> 4) | ((qh.s3 & 0x02) << 3)) * scale.s3 + minv.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=2
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i+2)*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i+2)*n_4 + 1);
|
||||
dequantized_weights.s0 = convert_half(((bits4.s0 & 0x0F00) >> 8) | ((qh.s0 & 0x04) << 2)) * scale.s0 + minv.s0;
|
||||
dequantized_weights.s1 = convert_half(((bits4.s1 & 0x0F00) >> 8) | ((qh.s1 & 0x04) << 2)) * scale.s1 + minv.s1;
|
||||
dequantized_weights.s2 = convert_half(((bits4.s2 & 0x0F00) >> 8) | ((qh.s2 & 0x04) << 2)) * scale.s2 + minv.s2;
|
||||
dequantized_weights.s3 = convert_half(((bits4.s3 & 0x0F00) >> 8) | ((qh.s3 & 0x04) << 2)) * scale.s3 + minv.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=3
|
||||
B.s0123 = read_imageh(src1, gy*2 + (i+3)*n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2 + (i+3)*n_4 + 1);
|
||||
dequantized_weights.s0 = convert_half(((bits4.s0 & 0xF000) >> 12) | ((qh.s0 & 0x08) << 1)) * scale.s0 + minv.s0;
|
||||
dequantized_weights.s1 = convert_half(((bits4.s1 & 0xF000) >> 12) | ((qh.s1 & 0x08) << 1)) * scale.s1 + minv.s1;
|
||||
dequantized_weights.s2 = convert_half(((bits4.s2 & 0xF000) >> 12) | ((qh.s2 & 0x08) << 1)) * scale.s2 + minv.s2;
|
||||
dequantized_weights.s3 = convert_half(((bits4.s3 & 0xF000) >> 12) | ((qh.s3 & 0x08) << 1)) * scale.s3 + minv.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
}
|
||||
|
||||
int idx = (gy<<3)*m + (gx<<2);
|
||||
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if(idx+3 < m*n_no_padding){
|
||||
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,291 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#endif
|
||||
|
||||
#define QK5_0 32
|
||||
#define NSUBGROUPS 4
|
||||
#define SUBGROUP_SIZE 64
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_hi(total_sums, bits4, bits1, scale, y) \
|
||||
float shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 0); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 0); \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 1); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 1); \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_lo(total_sums, bits4, bits1, scale, y) \
|
||||
shared_y = sub_group_broadcast(y.s0, 2); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 2); \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 3); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 3); \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_hi(total_sums, bits4, bits1, scale, y) \
|
||||
float8 shared_y; \
|
||||
shared_y = sub_group_broadcast(y, 0); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 1); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_lo(total_sums, bits4, bits1, scale, y) \
|
||||
shared_y = sub_group_broadcast(y, 2); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 3); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
__kernel void kernel_gemv_noshuffle_q5_0_f32(
|
||||
__read_only image1d_buffer_t src0_qs, // quantized A
|
||||
global ushort * src0_qh, // 5th bits
|
||||
global half2 * src0_d, // A scales
|
||||
__read_only image1d_buffer_t src1, // B activations
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00, // K
|
||||
int ne01) // M
|
||||
{
|
||||
uint groupId = get_local_id(1);
|
||||
uint gid = get_global_id(0);
|
||||
ushort slid = get_sub_group_local_id();
|
||||
|
||||
uint K = ne00;
|
||||
uint M = ne01;
|
||||
|
||||
uint LINE_STRIDE_A = M / 2;
|
||||
uint BLOCK_STRIDE_A = NSUBGROUPS * M;
|
||||
|
||||
private uint4 regA;
|
||||
private half2 regS;
|
||||
private float8 regB;
|
||||
|
||||
private float2 totalSum = (float2)(0.0f);
|
||||
|
||||
for (uint k = groupId; k < (K / QK5_0); k += NSUBGROUPS) {
|
||||
regS = src0_d[gid + k * LINE_STRIDE_A];
|
||||
|
||||
ushort4 qh_raw;
|
||||
qh_raw.s0 = src0_qh[gid + (4*k + 0) * LINE_STRIDE_A];
|
||||
qh_raw.s1 = src0_qh[gid + (4*k + 1) * LINE_STRIDE_A];
|
||||
qh_raw.s2 = src0_qh[gid + (4*k + 2) * LINE_STRIDE_A];
|
||||
qh_raw.s3 = src0_qh[gid + (4*k + 3) * LINE_STRIDE_A];
|
||||
|
||||
uchar8 raw = as_uchar8(qh_raw);
|
||||
uchar8 qh_bytes = (uchar8)(raw.s0, raw.s2, raw.s4, raw.s6,
|
||||
raw.s1, raw.s3, raw.s5, raw.s7);
|
||||
|
||||
// Load activations
|
||||
if (slid < 4) {
|
||||
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
|
||||
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
|
||||
}
|
||||
|
||||
regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
|
||||
regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
|
||||
regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
|
||||
regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
|
||||
|
||||
#ifdef VECTOR_SUB_GROUP_BROADCAST
|
||||
dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regB);
|
||||
#else
|
||||
dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regB);
|
||||
#endif // VECTOR_SUB_GROUP_BROADCAST
|
||||
|
||||
regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
|
||||
regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
|
||||
regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
|
||||
regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
|
||||
#ifdef VECTOR_SUB_GROUP_BROADCAST
|
||||
dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regB);
|
||||
#else
|
||||
dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regB);
|
||||
#endif // VECTOR_SUB_GROUP_BROADCAST
|
||||
}
|
||||
|
||||
// reduction in local memory, assumes #wave=4
|
||||
local float2 reduceLM[SUBGROUP_SIZE * 3];
|
||||
if (groupId == 1) {
|
||||
reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum;
|
||||
}
|
||||
if (groupId == 2) {
|
||||
reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum;
|
||||
}
|
||||
if (groupId == 3) {
|
||||
reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid];
|
||||
}
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid];
|
||||
}
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid];
|
||||
}
|
||||
|
||||
// 2 outputs per fiber in wave 0
|
||||
if (groupId == 0) {
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
vstore2(totalSum, 0, &(dst[gid * 2]));
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,294 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#endif
|
||||
|
||||
#define QK5_1 32
|
||||
#define NSUBGROUPS 4
|
||||
#define SUBGROUP_SIZE 64
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_hi(total_sums, bits4, bits1, scale, minv, y) \
|
||||
float shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 0); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 0); \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 1); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 1); \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_lo(total_sums, bits4, bits1, scale, minv, y) \
|
||||
shared_y = sub_group_broadcast(y.s0, 2); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 2); \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 3); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 3); \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_hi(total_sums, bits4, bits1, scale, minv, y) \
|
||||
float8 shared_y; \
|
||||
shared_y = sub_group_broadcast(y, 0); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 1); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_lo(total_sums, bits4, bits1, scale, minv, y) \
|
||||
shared_y = sub_group_broadcast(y, 2); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 3); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
__kernel void kernel_gemv_noshuffle_q5_1_f32(
|
||||
__read_only image1d_buffer_t src0_qs, // quantized A
|
||||
global ushort * src0_qh, // 5th bits
|
||||
global half2 * src0_d, // A scales
|
||||
global half2 * src0_m, // A mins
|
||||
__read_only image1d_buffer_t src1, // B activations
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00, // K
|
||||
int ne01) // M
|
||||
{
|
||||
uint groupId = get_local_id(1);
|
||||
uint gid = get_global_id(0);
|
||||
ushort slid = get_sub_group_local_id();
|
||||
|
||||
uint K = ne00;
|
||||
uint M = ne01;
|
||||
|
||||
uint LINE_STRIDE_A = M / 2;
|
||||
uint BLOCK_STRIDE_A = NSUBGROUPS * M;
|
||||
|
||||
__private uint4 regA;
|
||||
__private half2 regS;
|
||||
__private half2 regM;
|
||||
__private float8 regB;
|
||||
|
||||
__private float2 totalSum = (float2)(0.0f);
|
||||
|
||||
for (uint k = groupId; k < (K / QK5_1); k += NSUBGROUPS) {
|
||||
regS = src0_d[gid + k * LINE_STRIDE_A];
|
||||
regM = src0_m[gid + k * LINE_STRIDE_A];
|
||||
|
||||
ushort4 qh_raw;
|
||||
qh_raw.s0 = src0_qh[gid + (4*k + 0) * LINE_STRIDE_A];
|
||||
qh_raw.s1 = src0_qh[gid + (4*k + 1) * LINE_STRIDE_A];
|
||||
qh_raw.s2 = src0_qh[gid + (4*k + 2) * LINE_STRIDE_A];
|
||||
qh_raw.s3 = src0_qh[gid + (4*k + 3) * LINE_STRIDE_A];
|
||||
|
||||
uchar8 raw = as_uchar8(qh_raw);
|
||||
uchar8 qh_bytes = (uchar8)(raw.s0, raw.s2, raw.s4, raw.s6,
|
||||
raw.s1, raw.s3, raw.s5, raw.s7);
|
||||
|
||||
// Load activations
|
||||
if (slid < 4) {
|
||||
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
|
||||
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
|
||||
}
|
||||
|
||||
regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
|
||||
regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
|
||||
regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
|
||||
regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
|
||||
|
||||
#ifdef VECTOR_SUB_GROUP_BROADCAST
|
||||
dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB);
|
||||
#else
|
||||
dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB);
|
||||
#endif // VECTOR_SUB_GROUP_BROADCAST
|
||||
|
||||
regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
|
||||
regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
|
||||
regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
|
||||
regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
|
||||
#ifdef VECTOR_SUB_GROUP_BROADCAST
|
||||
dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB);
|
||||
#else
|
||||
dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB);
|
||||
#endif // VECTOR_SUB_GROUP_BROADCAST
|
||||
}
|
||||
|
||||
// reduction in local memory, assumes #wave=4
|
||||
local float2 reduceLM[SUBGROUP_SIZE * 3];
|
||||
if (groupId == 1) {
|
||||
reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum;
|
||||
}
|
||||
if (groupId == 2) {
|
||||
reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum;
|
||||
}
|
||||
if (groupId == 3) {
|
||||
reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid];
|
||||
}
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid];
|
||||
}
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid];
|
||||
}
|
||||
|
||||
// 2 outputs per fiber in wave 0
|
||||
if (groupId == 0) {
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
vstore2(totalSum, 0, &(dst[gid * 2]));
|
||||
}
|
||||
|
||||
}
|
||||
@@ -6202,6 +6202,19 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
break;
|
||||
}
|
||||
|
||||
#if VK_HEADER_VERSION >= 287
|
||||
// Honeykrisp driver for Asahi Linux doesn't report VK_VENDOR_ID_APPLE.
|
||||
// Check for Honeykrisp driver and force same configuration as the VK_VENDOR_ID_APPLE case.
|
||||
if (device->driver_id == vk::DriverId::eMesaHoneykrisp) {
|
||||
device->mul_mat_l[i] = false;
|
||||
device->mul_mat_m[i] = true;
|
||||
device->mul_mat_s[i] = false;
|
||||
device->mul_mat_id_l[i] = false;
|
||||
device->mul_mat_id_m[i] = true;
|
||||
device->mul_mat_id_s[i] = false;
|
||||
}
|
||||
#endif
|
||||
|
||||
device->mul_mat_l_int[i] = device->mul_mat_l[i];
|
||||
device->mul_mat_m_int[i] = device->mul_mat_m[i];
|
||||
device->mul_mat_s_int[i] = device->mul_mat_s[i];
|
||||
@@ -7604,8 +7617,12 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
|
||||
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
memcpy((uint8_t *)dst->ptr + offset + i * dpitch, (const uint8_t *) src + i * spitch, width);
|
||||
if (width == spitch && width == dpitch) {
|
||||
memcpy((uint8_t *)dst->ptr + offset, src, width * height);
|
||||
} else {
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
memcpy((uint8_t *)dst->ptr + offset + i * dpitch, (const uint8_t *) src + i * spitch, width);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
|
||||
@@ -7724,8 +7741,29 @@ static void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, si
|
||||
if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
|
||||
GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) src->ptr + offset + i * spitch, width);
|
||||
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(src->device->compute_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(src->device, subctx);
|
||||
subctx->s->buffer->buf.pipelineBarrier(
|
||||
vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer,
|
||||
vk::PipelineStageFlagBits::eHost,
|
||||
{},
|
||||
{ { vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferWrite,
|
||||
vk::AccessFlagBits::eHostRead } },
|
||||
{}, {});
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, src->device->fence);
|
||||
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX),
|
||||
"vk_buffer_read_2d uma waitForFences");
|
||||
src->device->device.resetFences({ src->device->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(src->device);
|
||||
|
||||
if (width == spitch && width == dpitch) {
|
||||
memcpy(dst, (const uint8_t *) src->ptr + offset, width * height);
|
||||
} else {
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) src->ptr + offset + i * spitch, width);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
|
||||
|
||||
@@ -154,6 +154,9 @@ class Keys:
|
||||
HIDDEN_ACT = "{arch}.hidden_activation"
|
||||
DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
|
||||
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
|
||||
TARGET_LAYERS = "{arch}.target_layers"
|
||||
TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size"
|
||||
NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
@@ -272,7 +275,8 @@ class Keys:
|
||||
CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
|
||||
CHAT_TEMPLATES = "tokenizer.chat_templates"
|
||||
# Normalizer constants
|
||||
NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase"
|
||||
NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase"
|
||||
NORMALIZER_STRIP_ACCENTS = "tokenizer.ggml.normalizer.strip_accents"
|
||||
# FIM/Infill special tokens constants
|
||||
FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
|
||||
FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
|
||||
@@ -510,6 +514,7 @@ class MODEL_ARCH(IntEnum):
|
||||
RND1 = auto()
|
||||
PANGU_EMBED = auto()
|
||||
MISTRAL3 = auto()
|
||||
EAGLE3 = auto()
|
||||
MISTRAL4 = auto()
|
||||
PADDLEOCR = auto()
|
||||
MIMO2 = auto()
|
||||
@@ -900,14 +905,17 @@ class MODEL_TENSOR(IntEnum):
|
||||
A_PER_DIM_K_SCALE = auto() # gemma4
|
||||
A_PER_DIM_SCALE = auto() # gemma4
|
||||
# nextn/mtp
|
||||
NEXTN_PROJ_PRE = auto()
|
||||
NEXTN_PROJ_POST = auto()
|
||||
NEXTN_EH_PROJ = auto()
|
||||
NEXTN_EMBED_TOKENS = auto()
|
||||
NEXTN_ENORM = auto()
|
||||
NEXTN_HNORM = auto()
|
||||
NEXTN_PROJ_PRE = auto()
|
||||
NEXTN_PROJ_POST = auto()
|
||||
NEXTN_EH_PROJ = auto()
|
||||
NEXTN_EMBED_TOKENS = auto()
|
||||
NEXTN_ENORM = auto()
|
||||
NEXTN_HNORM = auto()
|
||||
NEXTN_SHARED_HEAD_HEAD = auto()
|
||||
NEXTN_SHARED_HEAD_NORM = auto()
|
||||
# eagle3
|
||||
FC = auto() # feature fusion layer
|
||||
D2T = auto() # draft to target vocabulary mapping
|
||||
# lfm2 audio
|
||||
A_ENC_NORM_CONV = auto()
|
||||
A_ENC_LINEAR_POS = auto()
|
||||
@@ -1062,6 +1070,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.RND1: "rnd1",
|
||||
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
|
||||
MODEL_ARCH.MISTRAL3: "mistral3",
|
||||
MODEL_ARCH.EAGLE3: "eagle3",
|
||||
MODEL_ARCH.MISTRAL4: "mistral4",
|
||||
MODEL_ARCH.PADDLEOCR: "paddleocr",
|
||||
MODEL_ARCH.MIMO2: "mimo2",
|
||||
@@ -1094,8 +1103,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
|
||||
MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense
|
||||
MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
|
||||
MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
|
||||
MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
|
||||
@@ -1487,6 +1496,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm",
|
||||
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
|
||||
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
|
||||
MODEL_TENSOR.FC: "fc",
|
||||
MODEL_TENSOR.D2T: "d2t",
|
||||
}
|
||||
|
||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
@@ -4027,6 +4038,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.EAGLE3: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM_2,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FC,
|
||||
MODEL_TENSOR.D2T,
|
||||
],
|
||||
MODEL_ARCH.MISTRAL4: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
||||
@@ -1124,6 +1124,9 @@ class GGUFWriter:
|
||||
def add_normalizer_lowercase(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value)
|
||||
|
||||
def add_normalizer_strip_accents(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Tokenizer.NORMALIZER_STRIP_ACCENTS, value)
|
||||
|
||||
def add_eot_token_id(self, id: int) -> None:
|
||||
self.add_uint32(Keys.Tokenizer.EOT_ID, id)
|
||||
|
||||
|
||||
+19
-4
@@ -53,6 +53,7 @@ class SpecialVocab:
|
||||
special_token_ids: dict[str, int]
|
||||
chat_template: str | Sequence[Mapping[str, str]] | None
|
||||
normalizer_lowercase: bool | None
|
||||
normalizer_strip_accents: bool | None
|
||||
|
||||
def __init__(
|
||||
self, path: str | os.PathLike[str], load_merges: bool = False,
|
||||
@@ -66,6 +67,7 @@ class SpecialVocab:
|
||||
self.merges = []
|
||||
self.chat_template = None
|
||||
self.normalizer_lowercase = None
|
||||
self.normalizer_strip_accents = None
|
||||
if special_token_types is not None:
|
||||
self.special_token_types = special_token_types
|
||||
else:
|
||||
@@ -108,6 +110,10 @@ class SpecialVocab:
|
||||
if not quiet:
|
||||
logger.info(f'Setting normalizer_lowercase to {self.normalizer_lowercase}')
|
||||
gw.add_normalizer_lowercase(self.normalizer_lowercase)
|
||||
if self.normalizer_strip_accents is not None:
|
||||
if not quiet:
|
||||
logger.info(f'Setting normalizer_strip_accents to {self.normalizer_strip_accents}')
|
||||
gw.add_normalizer_strip_accents(self.normalizer_strip_accents)
|
||||
|
||||
def _load(self, path: Path) -> None:
|
||||
self._try_load_from_tokenizer_json(path)
|
||||
@@ -155,17 +161,21 @@ class SpecialVocab:
|
||||
def _parse_normalizer(self, normalizer: dict) -> None:
|
||||
# ref: https://huggingface.co/docs/tokenizers/api/normalizers
|
||||
#
|
||||
# Detects lowercase normalization in three possible formats:
|
||||
# 1. Standalone: {"type": "Lowercase"}
|
||||
# 2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...}
|
||||
# 3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]}
|
||||
# Extracts normalizer flags from three possible formats:
|
||||
# 1. Standalone: {"type": "Lowercase"}
|
||||
# 2. BertNormalizer attrs: {"type": "BertNormalizer", ...}
|
||||
# 3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]}
|
||||
|
||||
normalizer_type = normalizer.get('type')
|
||||
if normalizer_type == 'Lowercase':
|
||||
self.normalizer_lowercase = True
|
||||
elif normalizer_type == 'StripAccents':
|
||||
self.normalizer_strip_accents = True
|
||||
elif normalizer_type == 'BertNormalizer':
|
||||
if 'lowercase' in normalizer:
|
||||
self.normalizer_lowercase = normalizer['lowercase']
|
||||
if 'strip_accents' in normalizer:
|
||||
self.normalizer_strip_accents = normalizer['strip_accents']
|
||||
elif normalizer_type == 'Sequence':
|
||||
for norm in normalizer.get('normalizers', []):
|
||||
self._parse_normalizer(norm)
|
||||
@@ -246,6 +256,11 @@ class SpecialVocab:
|
||||
if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'):
|
||||
if not tokenizer_config:
|
||||
special_bos = special_first
|
||||
elif special_first not in (special_bos, special_cls):
|
||||
if not special_bos:
|
||||
tokenizer_config['bos_token'] = special_bos = special_first
|
||||
if not special_cls:
|
||||
tokenizer_config['cls_token'] = special_cls = special_first
|
||||
self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False
|
||||
if special_first not in (special_bos, special_cls):
|
||||
logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing<single>')
|
||||
|
||||
@@ -1 +1 @@
|
||||
7142aa6bf9fcaeec0fef8d80fcd90afe4268adf1
|
||||
3af5f5760e19a96427f5f7a93b79cbdf3d4b265b
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
import sys
|
||||
import subprocess
|
||||
|
||||
HTTPLIB_VERSION = "refs/tags/v0.46.1"
|
||||
HTTPLIB_VERSION = "refs/tags/v0.47.0"
|
||||
|
||||
vendor = {
|
||||
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",
|
||||
|
||||
+101
-3
@@ -16,11 +16,80 @@ set(HF_ENABLED "" CACHE STRING "Whether to allow HF Bucket download (ON/O
|
||||
set(BUILD_UI "" CACHE STRING "Build UI via npm (ON/OFF)")
|
||||
set(LLAMA_UI_EMBED "" CACHE STRING "Path to llama-ui-embed helper")
|
||||
|
||||
# IMPORTANT: When adding PWA assets, sync across all 3 places:
|
||||
# 1. tools/ui/src/lib/constants/pwa.ts (APPLE_DEVICES, PUBLIC_ENDPOINTS)
|
||||
# 2. tools/server/server-http.cpp (public_endpoints)
|
||||
# 3. scripts/ui-assets.cmake (ASSETS list)
|
||||
# - C++ (server-http.cpp) - public endpoints (splash screens generated via helper)
|
||||
# - TypeScript (constants/pwa.ts) - APPLE_DEVICES, PWA_MANIFEST, PUBLIC_ENDPOINTS
|
||||
#
|
||||
# When adding/changing PWA assets, update tools/ui/src/lib/constants/pwa.ts first,
|
||||
# then sync any new file names here and in server-http.cpp.
|
||||
set(ASSETS
|
||||
bundle.css
|
||||
bundle.js
|
||||
index.html
|
||||
loading.html
|
||||
# PWA assets
|
||||
favicon.ico
|
||||
favicon-dark.ico
|
||||
favicon.svg
|
||||
favicon-dark.svg
|
||||
pwa-64x64.png
|
||||
pwa-192x192.png
|
||||
pwa-512x512.png
|
||||
maskable-icon-512x512.png
|
||||
apple-touch-icon-180x180.png
|
||||
# iOS splash screens
|
||||
apple-splash-portrait-640x1136.png
|
||||
apple-splash-landscape-1136x640.png
|
||||
apple-splash-portrait-750x1334.png
|
||||
apple-splash-landscape-1334x750.png
|
||||
apple-splash-portrait-1170x2532.png
|
||||
apple-splash-landscape-2532x1170.png
|
||||
apple-splash-portrait-1179x2556.png
|
||||
apple-splash-landscape-2556x1179.png
|
||||
apple-splash-portrait-1206x2622.png
|
||||
apple-splash-landscape-2622x1206.png
|
||||
apple-splash-portrait-1284x2778.png
|
||||
apple-splash-landscape-2778x1284.png
|
||||
apple-splash-portrait-1290x2796.png
|
||||
apple-splash-landscape-2796x1290.png
|
||||
apple-splash-portrait-1320x2868.png
|
||||
apple-splash-landscape-2868x1320.png
|
||||
apple-splash-portrait-1488x2266.png
|
||||
apple-splash-landscape-2266x1488.png
|
||||
apple-splash-portrait-1640x2360.png
|
||||
apple-splash-landscape-2360x1640.png
|
||||
apple-splash-portrait-1668x2388.png
|
||||
apple-splash-landscape-2388x1668.png
|
||||
apple-splash-portrait-2048x2732.png
|
||||
apple-splash-landscape-2732x2048.png
|
||||
# iOS dark splash screens
|
||||
apple-splash-portrait-dark-640x1136.png
|
||||
apple-splash-landscape-dark-1136x640.png
|
||||
apple-splash-portrait-dark-750x1334.png
|
||||
apple-splash-landscape-dark-1334x750.png
|
||||
apple-splash-portrait-dark-1170x2532.png
|
||||
apple-splash-landscape-dark-2532x1170.png
|
||||
apple-splash-portrait-dark-1179x2556.png
|
||||
apple-splash-landscape-dark-2556x1179.png
|
||||
apple-splash-portrait-dark-1206x2622.png
|
||||
apple-splash-landscape-dark-2622x1206.png
|
||||
apple-splash-portrait-dark-1284x2778.png
|
||||
apple-splash-landscape-dark-2778x1284.png
|
||||
apple-splash-portrait-dark-1290x2796.png
|
||||
apple-splash-landscape-dark-2796x1290.png
|
||||
apple-splash-portrait-dark-1320x2868.png
|
||||
apple-splash-landscape-dark-2868x1320.png
|
||||
apple-splash-portrait-dark-1640x2360.png
|
||||
apple-splash-landscape-dark-2360x1640.png
|
||||
apple-splash-portrait-dark-1668x2388.png
|
||||
apple-splash-landscape-dark-2388x1668.png
|
||||
apple-splash-portrait-dark-2048x2732.png
|
||||
apple-splash-landscape-dark-2732x2048.png
|
||||
manifest.webmanifest
|
||||
sw.js
|
||||
_app/version.json
|
||||
build.json
|
||||
)
|
||||
|
||||
set(DIST_DIR "${UI_BINARY_DIR}/dist")
|
||||
@@ -159,7 +228,7 @@ function(npm_build out_var)
|
||||
|
||||
message(STATUS "UI: running npm run build, output -> ${DIST_DIR}")
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}"
|
||||
COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}" "LLAMA_UI_VERSION=${HF_VERSION}" "LLAMA_BUILD_NUMBER=${LLAMA_BUILD_NUMBER}"
|
||||
${NPM_EXECUTABLE} run build
|
||||
WORKING_DIRECTORY "${UI_SOURCE_DIR}"
|
||||
RESULT_VARIABLE rc
|
||||
@@ -274,8 +343,35 @@ function(emit_files)
|
||||
foreach(asset ${ASSETS})
|
||||
list(APPEND args "${asset}" "${DIST_DIR}/${asset}")
|
||||
endforeach()
|
||||
|
||||
# Bundle files live in _app/immutable/ — vanilla SvelteKit output, no plugin
|
||||
# rewriting. Embedded names must match the exact _app/ paths that index.html
|
||||
# and sw.js reference.
|
||||
file(GLOB_RECURSE detected_bundle_js "${DIST_DIR}/_app/immutable/bundle.*.js")
|
||||
file(GLOB_RECURSE detected_bundle_css "${DIST_DIR}/_app/immutable/assets/bundle.*.css")
|
||||
file(GLOB_RECURSE detected_workbox "${DIST_DIR}/workbox-*.js")
|
||||
# Compute relative path from DIST_DIR to each found file.
|
||||
# e.g. /path/to/build/tools/ui/dist/_app/immutable/bundle.XXX.js
|
||||
# -> _app/immutable/bundle.XXX.js
|
||||
foreach(f ${detected_bundle_js})
|
||||
string(REPLACE "${DIST_DIR}/" "" rel "${f}")
|
||||
list(APPEND args "${rel}" "${f}")
|
||||
endforeach()
|
||||
foreach(f ${detected_bundle_css})
|
||||
string(REPLACE "${DIST_DIR}/" "" rel "${f}")
|
||||
list(APPEND args "${rel}" "${f}")
|
||||
endforeach()
|
||||
foreach(f ${detected_workbox})
|
||||
string(REPLACE "${DIST_DIR}/" "" rel "${f}")
|
||||
list(APPEND args "${rel}" "${f}")
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
# Create build.json with the llama.cpp build number for UI version display.
|
||||
# This is separate from SvelteKit's _app/version.json (used for SW cache invalidation).
|
||||
# build.json is generated by the vite plugin (buildInfoPlugin) during npm build.
|
||||
# CMake just embeds it from the dist that npm produced.
|
||||
|
||||
execute_process(
|
||||
COMMAND "${LLAMA_UI_EMBED}" ${args}
|
||||
RESULT_VARIABLE rc
|
||||
@@ -300,6 +396,8 @@ endif()
|
||||
set(provisioned FALSE)
|
||||
|
||||
if(BUILD_UI)
|
||||
# Resolve version from git build-info if not explicitly set
|
||||
resolve_version(HF_VERSION)
|
||||
npm_build(NPM_OK)
|
||||
if(NPM_OK)
|
||||
set(provisioned TRUE)
|
||||
|
||||
+47
-37
@@ -3,7 +3,6 @@
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
@@ -128,6 +127,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_RND1, "rnd1" },
|
||||
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
|
||||
{ LLM_ARCH_MISTRAL3, "mistral3" },
|
||||
{ LLM_ARCH_EAGLE3, "eagle3" },
|
||||
{ LLM_ARCH_MISTRAL4, "mistral4" },
|
||||
{ LLM_ARCH_PADDLEOCR, "paddleocr" },
|
||||
{ LLM_ARCH_MIMO2, "mimo2" },
|
||||
@@ -292,46 +292,51 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
|
||||
{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
|
||||
|
||||
{ LLM_KV_TARGET_LAYERS, "%s.target_layers" },
|
||||
{ LLM_KV_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" },
|
||||
{ LLM_KV_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" },
|
||||
|
||||
{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
|
||||
// sentence-transformers dense modules feature dims
|
||||
{ LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" },
|
||||
{ LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
|
||||
{ LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
|
||||
{ LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },
|
||||
{ LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
|
||||
{ LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
|
||||
{ LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },
|
||||
|
||||
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
||||
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
|
||||
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
||||
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
|
||||
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
|
||||
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
|
||||
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
|
||||
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
|
||||
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
|
||||
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
|
||||
{ LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
|
||||
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
||||
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
||||
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
||||
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
|
||||
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
|
||||
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
||||
{ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
|
||||
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
|
||||
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
||||
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
||||
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
|
||||
{ LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
|
||||
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
|
||||
{ LLM_KV_TOKENIZER_SUPPRESS_TOKENS, "tokenizer.ggml.suppress_tokens" },
|
||||
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
||||
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
|
||||
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
||||
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
|
||||
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
|
||||
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
|
||||
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
|
||||
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
|
||||
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
|
||||
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
|
||||
{ LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
|
||||
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
||||
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
||||
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
||||
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
|
||||
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
|
||||
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
||||
{ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
|
||||
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
|
||||
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
||||
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
||||
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
|
||||
{ LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
|
||||
{ LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS, "tokenizer.ggml.normalizer.strip_accents" },
|
||||
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
|
||||
{ LLM_KV_TOKENIZER_SUPPRESS_TOKENS, "tokenizer.ggml.suppress_tokens" },
|
||||
|
||||
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
|
||||
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
|
||||
@@ -561,6 +566,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
|
||||
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
|
||||
{ LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" },
|
||||
{ LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" },
|
||||
{ LLM_TENSOR_FC, "fc" },
|
||||
{ LLM_TENSOR_D2T, "d2t" },
|
||||
};
|
||||
|
||||
// declare information about the model weight tensors:
|
||||
@@ -787,6 +794,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_MASKED_EMBD_CENTROIDS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
|
||||
{LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
|
||||
// eagle3
|
||||
{LLM_TENSOR_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
|
||||
};
|
||||
|
||||
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
|
||||
|
||||
@@ -141,6 +141,7 @@ enum llm_arch {
|
||||
LLM_ARCH_KIMI_LINEAR,
|
||||
LLM_ARCH_TALKIE,
|
||||
LLM_ARCH_MELLUM,
|
||||
LLM_ARCH_EAGLE3,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -314,6 +315,7 @@ enum llm_kv {
|
||||
LLM_KV_TOKENIZER_RWKV,
|
||||
LLM_KV_TOKENIZER_CHAT_TEMPLATE,
|
||||
LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
|
||||
LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS,
|
||||
LLM_KV_TOKENIZER_FIM_PRE_ID,
|
||||
LLM_KV_TOKENIZER_FIM_SUF_ID,
|
||||
LLM_KV_TOKENIZER_FIM_MID_ID,
|
||||
@@ -336,6 +338,10 @@ enum llm_kv {
|
||||
|
||||
LLM_KV_CLASSIFIER_OUTPUT_LABELS,
|
||||
|
||||
LLM_KV_TARGET_LAYERS,
|
||||
LLM_KV_TARGET_HIDDEN_SIZE,
|
||||
LLM_KV_NORM_BEFORE_RESIDUAL,
|
||||
|
||||
LLM_KV_SHORTCONV_L_CACHE,
|
||||
|
||||
LLM_KV_XIELU_ALPHA_N,
|
||||
@@ -568,6 +574,8 @@ enum llm_tensor {
|
||||
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
|
||||
LLM_TENSOR_MASKED_EMBD_CENTROIDS,
|
||||
LLM_TENSOR_MASKED_EMBD_ORDERING,
|
||||
LLM_TENSOR_FC,
|
||||
LLM_TENSOR_D2T,
|
||||
};
|
||||
|
||||
|
||||
|
||||
+107
-6
@@ -71,6 +71,9 @@ llama_context::llama_context(
|
||||
cparams.no_perf = params.no_perf;
|
||||
cparams.warmup = false;
|
||||
|
||||
cparams.embeddings_layer_inp.resize(hparams.n_layer(), false);
|
||||
embd_layer_inp.resize(hparams.n_layer());
|
||||
|
||||
cparams.ctx_type = params.ctx_type;
|
||||
cparams.pooling_type = params.pooling_type;
|
||||
|
||||
@@ -91,12 +94,21 @@ llama_context::llama_context(
|
||||
if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
|
||||
if (params.ctx_other == nullptr) {
|
||||
// TODO: change from runtime_error to llama_exception to avoid printing error message
|
||||
throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this is normal during memory fitting)");
|
||||
throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this warning is normal during memory fitting)");
|
||||
}
|
||||
|
||||
cparams.ctx_other = params.ctx_other;
|
||||
}
|
||||
|
||||
if (model.arch == LLM_ARCH_EAGLE3) {
|
||||
if (model.tok_embd == nullptr || model.output == nullptr) {
|
||||
if (params.ctx_other == nullptr) {
|
||||
throw std::runtime_error("EAGLE3 requires ctx_other to be set (this warning is normal during memory fitting)");
|
||||
}
|
||||
cparams.ctx_other = params.ctx_other;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize backend samplers here so they are part of the sampling graph
|
||||
// before the reserve passes run later in this function. This avoids a later
|
||||
// re-reserve when graph nodes change.
|
||||
@@ -194,7 +206,7 @@ llama_context::llama_context(
|
||||
|
||||
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
||||
|
||||
cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max;
|
||||
cparams.n_outputs_max = params.n_outputs_max == 0 || llama_model_has_encoder(&model) ? cparams.n_batch : params.n_outputs_max;
|
||||
|
||||
cparams.op_offload = params.op_offload;
|
||||
cparams.kv_unified = params.kv_unified;
|
||||
@@ -938,6 +950,14 @@ float * llama_context::get_embeddings_nextn_ith(int32_t i) {
|
||||
}
|
||||
}
|
||||
|
||||
float * llama_context::get_embeddings_layer_inp(uint32_t lid) {
|
||||
output_reorder();
|
||||
|
||||
GGML_ASSERT(lid < embd_layer_inp.size() && embd_layer_inp[lid].has_data());
|
||||
|
||||
return embd_layer_inp[lid].data;
|
||||
}
|
||||
|
||||
llama_token llama_context::get_sampled_token_ith(int32_t idx) {
|
||||
output_reorder();
|
||||
|
||||
@@ -1125,6 +1145,17 @@ void llama_context::set_embeddings_nextn(bool value, bool masked) {
|
||||
cparams.embeddings_nextn_masked = masked;
|
||||
}
|
||||
|
||||
void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) {
|
||||
LLAMA_LOG_DEBUG("%s: lid = %d, enable = %d\n", __func__, lid, enable);
|
||||
|
||||
GGML_ASSERT(lid < model.hparams.n_layer());
|
||||
|
||||
cparams.embeddings_layer_inp[lid] = enable;
|
||||
|
||||
// note: without this reserve, the draft acceptance drops to zero. not sure why - this is unexpected
|
||||
sched_need_reserve = true;
|
||||
}
|
||||
|
||||
void llama_context::set_causal_attn(bool value) {
|
||||
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
|
||||
|
||||
@@ -1350,7 +1381,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int64_t n_embd = hparams.n_embd_inp();
|
||||
// eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim
|
||||
const int64_t n_embd = hparams.n_embd_inp();
|
||||
const int64_t n_vocab = model.vocab.n_tokens();
|
||||
|
||||
// note: during encode, we always pass the full sequence starting from pos = 0
|
||||
@@ -1925,6 +1957,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
}
|
||||
}
|
||||
|
||||
extract_layer_inputs(res, n_tokens_prev, ubatch.n_tokens);
|
||||
|
||||
// extract nextn embeddings before
|
||||
// only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored.
|
||||
{
|
||||
@@ -2029,6 +2063,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
|
||||
const auto n_batch = cparams.n_batch;
|
||||
const auto n_vocab = vocab.n_tokens();
|
||||
const auto n_embd = hparams.n_embd;
|
||||
const auto n_embd_out = hparams.n_embd_out();
|
||||
|
||||
bool has_logits = true;
|
||||
@@ -2041,9 +2076,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
has_embd = true;
|
||||
}
|
||||
|
||||
|
||||
size_t backend_float_count = 0;
|
||||
size_t backend_token_count = 0;
|
||||
size_t embd_layer_inp_float_count = 0;
|
||||
|
||||
logits.size = has_logits ? n_vocab*n_outputs_max : 0;
|
||||
embd.size = has_embd ? n_embd_out*n_outputs_max : 0;
|
||||
@@ -2055,6 +2090,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
embd_nextn.size = (size_t) n_embd_out * n_batch;
|
||||
}
|
||||
|
||||
for (bool enabled : cparams.embeddings_layer_inp) {
|
||||
if (enabled) {
|
||||
embd_layer_inp_float_count += (size_t) n_embd * n_batch;
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate backend sampling output buffers if there are backend samplers configured.
|
||||
const bool has_sampling = !sampling.samplers.empty();
|
||||
if (has_sampling) {
|
||||
@@ -2069,8 +2110,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
|
||||
const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
|
||||
const size_t new_size =
|
||||
(logits.size + embd.size + embd_nextn.size + backend_float_count) * sizeof(float) +
|
||||
( backend_token_count) * sizeof(llama_token);
|
||||
(logits.size + embd.size + embd_nextn.size + embd_layer_inp_float_count + backend_float_count) * sizeof(float) +
|
||||
( backend_token_count) * sizeof(llama_token);
|
||||
|
||||
// alloc only when more than the current capacity is required
|
||||
// TODO: also consider shrinking the buffer
|
||||
@@ -2087,6 +2128,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
logits.data = nullptr;
|
||||
embd.data = nullptr;
|
||||
embd_nextn.data = nullptr;
|
||||
for (auto & layer_inp : embd_layer_inp) {
|
||||
layer_inp = {nullptr, 0};
|
||||
}
|
||||
}
|
||||
|
||||
auto * buft = ggml_backend_cpu_buffer_type();
|
||||
@@ -2118,6 +2162,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
embd_nextn = has_embd_nextn ? buffer_view<float>{(float *) (base + offset), embd_nextn.size} : buffer_view<float>{nullptr, 0};
|
||||
offset += embd_nextn.size * sizeof(float);
|
||||
|
||||
for (uint32_t il = 0; il < embd_layer_inp.size(); ++il) {
|
||||
if (cparams.embeddings_layer_inp[il]) {
|
||||
embd_layer_inp[il] = buffer_view<float>{(float *) (base + offset), (size_t) n_embd * n_batch};
|
||||
offset += embd_layer_inp[il].size * sizeof(float);
|
||||
} else {
|
||||
embd_layer_inp[il] = buffer_view<float>{nullptr, 0};
|
||||
}
|
||||
}
|
||||
|
||||
if (has_sampling) {
|
||||
sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
|
||||
offset += sampling.logits.size * sizeof(float);
|
||||
@@ -2164,6 +2217,34 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
return n_outputs_max;
|
||||
}
|
||||
|
||||
void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens) {
|
||||
for (uint32_t il = 0; il < cparams.embeddings_layer_inp.size(); ++il) {
|
||||
if (!cparams.embeddings_layer_inp[il]) {
|
||||
continue;
|
||||
}
|
||||
if (!embd_layer_inp[il].has_data()) {
|
||||
GGML_ABORT("output layer input buffer not allocated");
|
||||
}
|
||||
ggml_tensor * t = res->get_layer_inp((int) il);
|
||||
if (!t) {
|
||||
GGML_ABORT("layer input tensor not found");
|
||||
}
|
||||
|
||||
const size_t nbytes = ggml_nbytes(t);
|
||||
const size_t nfloats = nbytes / sizeof(float);
|
||||
GGML_ASSERT(n_tokens > 0);
|
||||
GGML_ASSERT(nfloats % n_tokens == 0);
|
||||
|
||||
const size_t row_floats = nfloats / n_tokens;
|
||||
const size_t dst_offset = token_offset * row_floats;
|
||||
GGML_ASSERT(dst_offset + nfloats <= embd_layer_inp[il].size);
|
||||
|
||||
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), t);
|
||||
GGML_ASSERT(backend != nullptr);
|
||||
ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data + dst_offset, 0, nbytes);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_context::output_reorder() {
|
||||
const uint64_t n_vocab = model.vocab.n_tokens();
|
||||
const uint64_t n_embd = model.hparams.n_embd;
|
||||
@@ -2190,6 +2271,16 @@ void llama_context::output_reorder() {
|
||||
}
|
||||
}
|
||||
|
||||
if (embd_layer_inp.size() > 0) {
|
||||
for (int lid = 0; lid < (int) embd_layer_inp.size(); ++lid) {
|
||||
if (embd_layer_inp[lid].size > 0) {
|
||||
for (uint64_t k = 0; k < n_embd; ++k) {
|
||||
std::swap(embd_layer_inp[lid].data[i0*n_embd + k], embd_layer_inp[lid].data[i1*n_embd + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!sampling.samplers.empty()) {
|
||||
assert(sampling.logits.size > 0);
|
||||
assert(sampling.probs.size > 0);
|
||||
@@ -3604,6 +3695,10 @@ void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) {
|
||||
ctx->set_embeddings_nextn(value, masked);
|
||||
}
|
||||
|
||||
void llama_set_embeddings_layer_inp(llama_context * ctx, uint32_t lid, bool value) {
|
||||
ctx->set_embeddings_layer_inp(lid, value);
|
||||
}
|
||||
|
||||
llama_memory_t llama_get_memory(const struct llama_context * ctx) {
|
||||
if (!ctx) {
|
||||
return nullptr;
|
||||
@@ -3624,6 +3719,12 @@ float * llama_get_embeddings_nextn_ith(llama_context * ctx, int32_t i) {
|
||||
return ctx->get_embeddings_nextn_ith(i);
|
||||
}
|
||||
|
||||
float * llama_get_embeddings_layer_inp(llama_context * ctx, uint32_t lid) {
|
||||
ctx->synchronize();
|
||||
|
||||
return ctx->get_embeddings_layer_inp(lid);
|
||||
}
|
||||
|
||||
bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
|
||||
return ctx->set_sampler(seq_id, smpl);
|
||||
}
|
||||
|
||||
@@ -88,6 +88,8 @@ struct llama_context {
|
||||
float * get_embeddings_nextn();
|
||||
float * get_embeddings_nextn_ith(int32_t i);
|
||||
|
||||
float * get_embeddings_layer_inp(uint32_t lid);
|
||||
|
||||
llama_token * get_sampled_tokens() const;
|
||||
llama_token get_sampled_token_ith(int32_t idx);
|
||||
|
||||
@@ -112,6 +114,7 @@ struct llama_context {
|
||||
|
||||
void set_embeddings (bool value);
|
||||
void set_embeddings_nextn(bool value, bool masked);
|
||||
void set_embeddings_layer_inp(uint32_t lid, bool enable);
|
||||
void set_causal_attn(bool value);
|
||||
void set_warmup(bool value);
|
||||
|
||||
@@ -226,6 +229,10 @@ private:
|
||||
// map the output row index `i` to batch index
|
||||
int64_t output_resolve_row(int32_t i) const;
|
||||
|
||||
// async-copy enabled layer-input tensors (per cparams.output_layer_inp)
|
||||
// from backend into host-side embd_layer_inp buffers
|
||||
void extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens);
|
||||
|
||||
//
|
||||
// graph
|
||||
//
|
||||
@@ -288,6 +295,10 @@ private:
|
||||
// sets llm_graph_result::t_h_nextn
|
||||
buffer_view<float> embd_nextn = {nullptr, 0};
|
||||
|
||||
// host buffers for output layer input embeddings, per layer
|
||||
// populated when cparams.output_layer_inp[il] is true
|
||||
std::vector<buffer_view<float>> embd_layer_inp;
|
||||
|
||||
struct sampling_info {
|
||||
// !samplers.empty() to check if any samplers are active
|
||||
std::map<llama_seq_id, llama_sampler *> samplers;
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
#define LLAMA_MAX_SEQ 256
|
||||
|
||||
@@ -44,6 +45,8 @@ struct llama_cparams {
|
||||
bool kv_unified;
|
||||
bool pipeline_parallel;
|
||||
|
||||
std::vector<bool> embeddings_layer_inp; // [n_layer()] extract input embeddings for layer
|
||||
|
||||
enum llama_context_type ctx_type;
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
// this is a staging header for new llama.cpp API
|
||||
// breaking changes and C++ are allowed. everything here should be considered WIP
|
||||
// try as much as possible to not include this header in the rest of the codebase
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
@@ -101,4 +102,20 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
|
||||
// LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
||||
LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i);
|
||||
|
||||
// Set whether the context outputs the input embeddings of a specific layer
|
||||
LLAMA_API void llama_set_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid, bool value);
|
||||
|
||||
// mirrors:
|
||||
// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
LLAMA_API float * llama_get_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid);
|
||||
|
||||
LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// model/context data extraction
|
||||
//
|
||||
|
||||
// returns pointer to the target-model layer indices
|
||||
LLAMA_API const int32_t * llama_model_target_layer_ids (const struct llama_model * model);
|
||||
// returns the number of extracted layers from target model
|
||||
LLAMA_API uint32_t llama_model_target_layer_ids_n(const struct llama_model * model);
|
||||
|
||||
+14
-1
@@ -904,6 +904,10 @@ void llm_graph_result::reset() {
|
||||
t_logits = nullptr;
|
||||
t_embd = nullptr;
|
||||
t_embd_pooled = nullptr;
|
||||
|
||||
t_layer_inp.resize(LLAMA_MAX_LAYERS);
|
||||
std::fill(t_layer_inp.begin(), t_layer_inp.end(), nullptr);
|
||||
|
||||
t_sampled.clear();
|
||||
t_sampled_probs.clear();
|
||||
t_sampled_logits.clear();
|
||||
@@ -932,7 +936,7 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
void llm_graph_result::set_outputs() {
|
||||
void llm_graph_result::set_outputs(const llm_graph_params & params) {
|
||||
if (t_logits != nullptr) {
|
||||
ggml_set_output(t_logits);
|
||||
}
|
||||
@@ -945,6 +949,15 @@ void llm_graph_result::set_outputs() {
|
||||
if (t_h_nextn != nullptr) {
|
||||
ggml_set_output(t_h_nextn);
|
||||
}
|
||||
{
|
||||
const auto & embeddings_layer_inp = params.cparams.embeddings_layer_inp;
|
||||
for (size_t il = 0; il < embeddings_layer_inp.size(); ++il) {
|
||||
if (embeddings_layer_inp[il]) {
|
||||
GGML_ASSERT(t_layer_inp[il] != nullptr && "layer input tensor is null");
|
||||
ggml_set_output(t_layer_inp[il]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto & [seq_id, t] : t_sampled) {
|
||||
if (t != nullptr) {
|
||||
ggml_set_output(t);
|
||||
|
||||
+9
-5
@@ -705,6 +705,8 @@ public:
|
||||
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
|
||||
ggml_tensor * get_h_nextn() const { return t_h_nextn; }
|
||||
|
||||
ggml_tensor * get_layer_inp(int il) const { return t_layer_inp[il]; }
|
||||
|
||||
ggml_cgraph * get_gf() const { return gf; }
|
||||
ggml_context * get_ctx() const { return ctx_compute.get(); }
|
||||
|
||||
@@ -713,7 +715,7 @@ public:
|
||||
void reset();
|
||||
|
||||
void set_inputs(const llama_ubatch * ubatch);
|
||||
void set_outputs();
|
||||
void set_outputs(const llm_graph_params & params);
|
||||
|
||||
// try to update the existing graph result using the new graph parameters in order to reuse it
|
||||
// this can only be done if we determine that the resulting graph using the new graph parameters
|
||||
@@ -734,10 +736,12 @@ public:
|
||||
ggml_tensor * t_embd_pooled = nullptr;
|
||||
ggml_tensor * t_h_nextn = nullptr; // [n_embd, n_outputs] hidden state before final output norm
|
||||
|
||||
std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
|
||||
std::map<llama_seq_id, ggml_tensor*> t_candidates;
|
||||
std::map<llama_seq_id, ggml_tensor*> t_sampled;
|
||||
std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
|
||||
std::vector<ggml_tensor *> t_layer_inp;
|
||||
|
||||
std::map<llama_seq_id, ggml_tensor *> t_sampled_logits;
|
||||
std::map<llama_seq_id, ggml_tensor *> t_candidates;
|
||||
std::map<llama_seq_id, ggml_tensor *> t_sampled;
|
||||
std::map<llama_seq_id, ggml_tensor *> t_sampled_probs;
|
||||
|
||||
std::vector<llm_graph_input_ptr> inputs;
|
||||
|
||||
|
||||
@@ -45,6 +45,7 @@ struct llama_hparams {
|
||||
bool rope_finetuned;
|
||||
bool use_par_res;
|
||||
bool swin_norm;
|
||||
bool norm_before_residual = false;
|
||||
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
uint32_t n_embd;
|
||||
|
||||
@@ -394,6 +394,7 @@ namespace GGUFMeta {
|
||||
|
||||
template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
|
||||
template bool llama_model_loader::get_arr<std::array<int32_t, 512>>(enum llm_kv kid, std::array<int32_t, 512> & result, bool required);
|
||||
template bool llama_model_loader::get_arr<std::vector<int32_t>>(enum llm_kv kid, std::vector<int32_t> & result, bool required);
|
||||
|
||||
template<typename T>
|
||||
bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
|
||||
|
||||
+16
-3
@@ -287,6 +287,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
|
||||
return new llama_model_qwen35moe(params);
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
return new llama_model_mistral3(params);
|
||||
case LLM_ARCH_EAGLE3:
|
||||
return new llama_model_eagle3(params);
|
||||
case LLM_ARCH_MIMO2:
|
||||
return new llama_model_mimo2(params);
|
||||
case LLM_ARCH_KIMI_LINEAR:
|
||||
@@ -2238,7 +2240,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
// TODO: move reranking logic here and generalize
|
||||
llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);
|
||||
|
||||
llm->res->set_outputs();
|
||||
llm->res->set_outputs(params);
|
||||
|
||||
return llm->res->get_gf();
|
||||
}
|
||||
@@ -2406,6 +2408,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_ERNIE4_5:
|
||||
case LLM_ARCH_ERNIE4_5_MOE:
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
case LLM_ARCH_EAGLE3:
|
||||
case LLM_ARCH_MISTRAL4:
|
||||
case LLM_ARCH_LLAMA_EMBED:
|
||||
case LLM_ARCH_MAINCODER:
|
||||
@@ -2600,8 +2603,9 @@ uint64_t llama_model_n_params(const llama_model * model) {
|
||||
|
||||
bool llama_model_has_encoder(const llama_model * model) {
|
||||
switch (model->arch) {
|
||||
case LLM_ARCH_T5: return true;
|
||||
case LLM_ARCH_T5ENCODER: return true;
|
||||
case LLM_ARCH_T5:
|
||||
case LLM_ARCH_T5ENCODER:
|
||||
case LLM_ARCH_EAGLE3: return true;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
@@ -2687,3 +2691,12 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid,
|
||||
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
|
||||
const int32_t * llama_model_target_layer_ids(const struct llama_model * model) {
|
||||
const auto & v = model->target_layer_ids;
|
||||
return v.empty() ? nullptr : v.data();
|
||||
}
|
||||
|
||||
uint32_t llama_model_target_layer_ids_n(const struct llama_model * model) {
|
||||
return (uint32_t) model->target_layer_ids.size();
|
||||
}
|
||||
|
||||
@@ -569,6 +569,13 @@ struct llama_model {
|
||||
struct ggml_tensor * per_layer_model_proj = nullptr;
|
||||
struct ggml_tensor * per_layer_proj_norm = nullptr;
|
||||
|
||||
// eagle3
|
||||
struct ggml_tensor * fc = nullptr; // feature fusion layer
|
||||
struct ggml_tensor * d2t = nullptr; // draft to target vocabulary mapping
|
||||
|
||||
// unified vector to store target-model extracted layer ids in eagle3, dflash, etc.
|
||||
std::vector<int32_t> target_layer_ids;
|
||||
|
||||
std::vector<llama_layer> layers;
|
||||
|
||||
//Dense linear projections for SentenceTransformers models like embeddinggemma
|
||||
|
||||
+23
-12
@@ -764,7 +764,7 @@ struct llm_tokenizer_wpm_session {
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
||||
// normalize and split by whitespace
|
||||
std::vector<std::string> words = preprocess(text, vocab.get_normalizer_lowercase());
|
||||
std::vector<std::string> words = preprocess(text, vocab.get_normalizer_opts());
|
||||
// bos token prepended already
|
||||
|
||||
// find the longest tokens that form the words
|
||||
@@ -809,11 +809,14 @@ struct llm_tokenizer_wpm_session {
|
||||
}
|
||||
|
||||
// TODO: reduce string copies by using cpts_offs array
|
||||
static std::vector<std::string> preprocess(const std::string & text, bool lowercase) {
|
||||
const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
|
||||
static std::vector<std::string> preprocess(const std::string & text, const llama_vocab::normalizer_options & normalizer_opts) {
|
||||
std::vector<uint32_t> cpts = unicode_cpts_from_utf8(text);
|
||||
if (normalizer_opts.strip_accents) {
|
||||
cpts = unicode_cpts_normalize_nfd(cpts);
|
||||
}
|
||||
std::vector<std::string> words(1, "");
|
||||
|
||||
for (const uint32_t cpt : cpts_nfd) {
|
||||
for (const uint32_t cpt : cpts) {
|
||||
const auto flags = unicode_cpt_flags_from_cpt(cpt);
|
||||
|
||||
if (flags.is_whitespace) {
|
||||
@@ -828,7 +831,11 @@ struct llm_tokenizer_wpm_session {
|
||||
continue;
|
||||
}
|
||||
|
||||
const std::string s = unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt);
|
||||
if (normalizer_opts.strip_accents && flags.is_accent_mark) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const std::string s = unicode_cpt_to_utf8(normalizer_opts.lowercase ? unicode_tolower(cpt) : cpt);
|
||||
if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
|
||||
if (words.back().size()) { // finish previous word if any
|
||||
words.emplace_back();
|
||||
@@ -1692,7 +1699,7 @@ struct llm_tokenizer_whitespace_session : llm_tokenizer_bpe_session {
|
||||
llm_tokenizer_whitespace_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_token> & output) override {
|
||||
const bool lowercase = vocab.get_normalizer_lowercase();
|
||||
const bool lowercase = vocab.get_normalizer_opts().lowercase;
|
||||
|
||||
std::string segment;
|
||||
auto flush = [&]() {
|
||||
@@ -1797,7 +1804,9 @@ struct llama_vocab::impl {
|
||||
bool remove_extra_whitespaces = false;
|
||||
bool escape_whitespaces = true;
|
||||
bool treat_whitespace_as_suffix = false;
|
||||
bool normalizer_lowercase = true; // Lowercase normalizer (tokenizer.json)
|
||||
|
||||
// BertNormalizer options
|
||||
llama_vocab::normalizer_options normalizer_opts;
|
||||
|
||||
std::unordered_map<std::string, llama_token> token_to_id;
|
||||
std::vector<token_data> id_to_token;
|
||||
@@ -2172,7 +2181,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
} else if (
|
||||
tokenizer_pre == "whitespace") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE;
|
||||
normalizer_lowercase = false;
|
||||
normalizer_opts.lowercase = false;
|
||||
} else if (
|
||||
tokenizer_pre == "refact") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
|
||||
@@ -2532,8 +2541,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
}
|
||||
}
|
||||
|
||||
// Lowercase normalizer flag (consulted by WPM / whitespace BPE)
|
||||
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false);
|
||||
// BertNormalizer options
|
||||
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_opts.lowercase, false);
|
||||
normalizer_opts.strip_accents = normalizer_opts.lowercase;
|
||||
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS, normalizer_opts.strip_accents, false);
|
||||
|
||||
// suppress tokens
|
||||
{
|
||||
@@ -3969,8 +3980,8 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
|
||||
return pimpl->treat_whitespace_as_suffix;
|
||||
}
|
||||
|
||||
bool llama_vocab::get_normalizer_lowercase() const {
|
||||
return pimpl->normalizer_lowercase;
|
||||
const llama_vocab::normalizer_options & llama_vocab::get_normalizer_opts() const {
|
||||
return pimpl->normalizer_opts;
|
||||
}
|
||||
|
||||
const std::vector<llama_token> & llama_vocab::get_suppress_tokens() const {
|
||||
|
||||
+7
-1
@@ -76,6 +76,12 @@ struct llama_vocab {
|
||||
llama_token_attr attr;
|
||||
};
|
||||
|
||||
struct normalizer_options {
|
||||
bool lowercase = true;
|
||||
bool strip_accents = true;
|
||||
// TODO: clean_text, handle_chinese_chars
|
||||
};
|
||||
|
||||
llama_vocab();
|
||||
~llama_vocab();
|
||||
|
||||
@@ -141,7 +147,7 @@ struct llama_vocab {
|
||||
bool get_remove_extra_whitespaces () const;
|
||||
bool get_escape_whitespaces () const;
|
||||
bool get_treat_whitespace_as_suffix() const;
|
||||
bool get_normalizer_lowercase () const;
|
||||
const normalizer_options & get_normalizer_opts() const;
|
||||
|
||||
const std::vector<llama_token> & get_suppress_tokens() const;
|
||||
|
||||
|
||||
@@ -0,0 +1,323 @@
|
||||
#include "models.h"
|
||||
|
||||
void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) {
|
||||
throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata");
|
||||
}
|
||||
if (target_layer_ids.size() != 3) {
|
||||
throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'extract_layers'");
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__,
|
||||
target_layer_ids[0],
|
||||
target_layer_ids[1],
|
||||
target_layer_ids[2]);
|
||||
|
||||
uint32_t n_embd_tgt = 0;
|
||||
|
||||
ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt);
|
||||
LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd);
|
||||
|
||||
hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;
|
||||
|
||||
// eagle3 norm_before_residual (optional, default false)
|
||||
// compatible with Readhat eagle3 speculator model
|
||||
ml.get_key(LLM_KV_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false);
|
||||
if (hparams.norm_before_residual) {
|
||||
LLAMA_LOG_INFO("%s: EAGLE3gnorm_before_residual = true\n", __func__);
|
||||
}
|
||||
|
||||
type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
|
||||
LLAMA_LOAD_LOCALS;
|
||||
|
||||
const int64_t n_embd_inp = hparams.n_embd_inp();
|
||||
const int64_t n_embd_attn_input = 2 * n_embd;
|
||||
|
||||
// Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target)
|
||||
// d2t: draft to target vocabulary mapping
|
||||
int64_t n_draft_vocab = n_vocab; // Default: same as target vocab
|
||||
const struct ggml_tensor * d2t_meta = ml->get_tensor_meta("d2t");
|
||||
if (d2t_meta) {
|
||||
n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size
|
||||
d2t = create_tensor(tn(LLM_TENSOR_D2T), {n_draft_vocab}, 0);
|
||||
LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
|
||||
} else {
|
||||
d2t = nullptr; // no d2t, use default vocab size
|
||||
LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
|
||||
}
|
||||
|
||||
// Feature fusion layer: projects 3 target layers to draft hidden size
|
||||
fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), {n_embd_inp, n_embd}, 0);
|
||||
|
||||
// Output layer (uses draft vocab size)
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
// Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own)
|
||||
const struct ggml_tensor * tok_embd_meta = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str());
|
||||
if (tok_embd_meta) {
|
||||
const int64_t n_target_vocab = tok_embd_meta->ne[1];
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0);
|
||||
LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab);
|
||||
}
|
||||
|
||||
// Single decoder layer
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
// input_layernorm: applied to token embeddings
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
// eagle3 specific: hidden_norm applied to fused target features
|
||||
layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
|
||||
|
||||
// Attention takes input_embeds_normed + fused_target_normed as input
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
|
||||
// rope_freqs for llama3 rope scaling (optional - only if eagle3 config has rope_scaling)
|
||||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<llm_graph_context> llama_model_eagle3::build_arch_graph(const llm_graph_params & params) const {
|
||||
switch (params.gtype) {
|
||||
case LLM_GRAPH_TYPE_ENCODER:
|
||||
return std::make_unique<graph<true>>(*this, params);
|
||||
case LLM_GRAPH_TYPE_DEFAULT:
|
||||
case LLM_GRAPH_TYPE_DECODER:
|
||||
return std::make_unique<graph<false>>(*this, params);
|
||||
default:
|
||||
GGML_ABORT("invalid graph type");
|
||||
};
|
||||
}
|
||||
|
||||
template <>
|
||||
ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const {
|
||||
ggml_tensor * cur = nullptr;
|
||||
|
||||
// Input: Target model features (3 layers concatenated: low, mid, high)
|
||||
// Data will be provided via ubatch->embd in encode_eagle3_features()
|
||||
auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp());
|
||||
inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,hparams.n_embd_inp(), n_tokens);
|
||||
ggml_set_input(inp_target->embd);
|
||||
|
||||
cur = inp_target->embd;
|
||||
cb(cur, "inp_embd", -1);
|
||||
|
||||
res->add_input(std::move(inp_target));
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// eagle3 Encoder: processes target model features through feature fusion layer
|
||||
// Input: target_features e.g. [12288, n_tokens] from target model layers low, middle, high
|
||||
// Output: g_embeddings e.g. [4096, n_tokens] stored in context
|
||||
template <>
|
||||
llama_model_eagle3::graph<true>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
ggml_tensor * cur = nullptr;
|
||||
|
||||
cur = build_inp_embd_enc();
|
||||
|
||||
// Feature fusion layer
|
||||
cur = build_lora_mm(model.fc, cur);
|
||||
cb(cur, "fc_out", -1);
|
||||
|
||||
// Output: g_embeddings e.g. [4096, n_tokens]
|
||||
// store in t_h_nextn (same as MTP) so can be read via llama_get_embeddings_nextn(ctx_dft)
|
||||
ggml_set_output(cur);
|
||||
res->t_h_nextn = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
// eagle3 Decoder: processes draft tokens using g_embeddings from encoder
|
||||
// Input: draft tokens + g_embeddings from encoder
|
||||
// Output: draft logits
|
||||
template <>
|
||||
llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_layer == 1); // eagle3 has only one decoder layer
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
// eagle3 Decoder receives:
|
||||
// 1. Token embeddings (e.g.from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B)
|
||||
// 2. g_embeddings from encoder
|
||||
auto * tok_embd = model.tok_embd;
|
||||
if (model.tok_embd == nullptr) {
|
||||
GGML_ASSERT(cparams.ctx_other != nullptr);
|
||||
const auto * model_other = llama_get_model(cparams.ctx_other);
|
||||
|
||||
GGML_ASSERT(model_other->tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)");
|
||||
tok_embd = model_other->tok_embd;
|
||||
}
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
|
||||
|
||||
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
ggml_set_input(inp->tokens);
|
||||
|
||||
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
||||
ggml_set_input(inp->embd);
|
||||
|
||||
ggml_tensor * inp_embd = ggml_get_rows(ctx0, tok_embd, inp->tokens);
|
||||
cb(inp_embd, "inp_embd", -1);
|
||||
|
||||
ggml_tensor * inp_g = inp->embd;
|
||||
cb(inp_g, "inp_g_embeddings", -1);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
inpL = inp_g;
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
|
||||
|
||||
// Single decoder layer (il = 0)
|
||||
const int il = 0;
|
||||
{
|
||||
// Apply input_layernorm to the token embeddings
|
||||
ggml_tensor * embd_norm = build_norm(inp_embd,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(embd_norm, "embd_norm", il);
|
||||
|
||||
// Apply hidden_norm to inp_g
|
||||
ggml_tensor * g_norm = build_norm(inp_g,
|
||||
model.layers[il].attn_norm_2, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(g_norm, "g_norm", il);
|
||||
|
||||
// norm_before_residual: determines what goes into the residual connection (compatible with Readhat eagle3 speculator model)
|
||||
// - false (default): use raw inp_g for residual
|
||||
// - true: use normalized g_norm for residual
|
||||
// inpL is the concatenated input (normalized inp_embd + normalized inp_g)
|
||||
ggml_tensor * inpSA = hparams.norm_before_residual ? g_norm : inpL;
|
||||
|
||||
// Concatenate normalized inp_embd and normalized inp_g
|
||||
cur = ggml_concat(ctx0, embd_norm, g_norm, il);
|
||||
cb(cur, "concat_embd", il);
|
||||
|
||||
// Self-attention with concatenated input
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
// rope freq factors, returns nullptr if not available
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||
|
||||
// RoPE
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur_rope", il);
|
||||
cb(Kcur, "Kcur_rope", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, NULL, nullptr,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
|
||||
// Add residual and update it
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// Apply FFN norm to the sum
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "post_attn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
// Output norm with residual
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "eagle3_prenorm", il);
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
// Output prenorm state (for next token's g_embeddings in autoregressive generation)
|
||||
ggml_set_output(cur);
|
||||
res->t_h_nextn = cur;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head - projects to draft vocabulary
|
||||
// if the draft has no own output projection, inherit the target model's lm_head
|
||||
auto * output = model.output;
|
||||
if (output == nullptr) {
|
||||
GGML_ASSERT(cparams.ctx_other != nullptr);
|
||||
const auto * model_other = llama_get_model(cparams.ctx_other);
|
||||
|
||||
GGML_ASSERT(model_other->output != nullptr && "EAGLE3 decoder requires an output projection (own or from target model)");
|
||||
output = model_other->output;
|
||||
}
|
||||
cur = build_lora_mm(output, cur);
|
||||
|
||||
if (model.d2t) {
|
||||
const int64_t n_draft_vocab = cur->ne[0];
|
||||
const int64_t n_outputs = cur->ne[1];
|
||||
const int64_t n_vocab = (int64_t) model.vocab.n_tokens();
|
||||
|
||||
GGML_ASSERT(model.d2t->type == GGML_TYPE_I64);
|
||||
GGML_ASSERT(model.d2t->ne[0] == n_draft_vocab);
|
||||
|
||||
ggml_tensor * logits = ggml_fill(ctx0, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, n_vocab, n_outputs), -INFINITY);
|
||||
cur = ggml_set_rows(ctx0, logits,
|
||||
ggml_reshape_3d(ctx0, cur, 1, n_draft_vocab, n_outputs),
|
||||
ggml_reshape_3d(ctx0, model.d2t, n_draft_vocab, 1, 1));
|
||||
cur = ggml_reshape_2d(ctx0, cur, n_vocab, n_outputs);
|
||||
}
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
@@ -210,6 +210,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
|
||||
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
const int n_rot_l = hparams.n_rot(il);
|
||||
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
@@ -124,6 +124,8 @@ llama_model_llama::graph<embed>::graph(const llama_model & model, const llm_grap
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
|
||||
@@ -1089,6 +1089,21 @@ struct llama_model_glm_dsa : public llama_model_base {
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
struct llama_model_eagle3 : public llama_model_base {
|
||||
llama_model_eagle3(const struct llama_model_params & params) : llama_model_base(params) {}
|
||||
void load_arch_hparams(llama_model_loader & ml) override;
|
||||
void load_arch_tensors(llama_model_loader & ml) override;
|
||||
|
||||
template <bool is_enc>
|
||||
struct graph : public llm_graph_context {
|
||||
graph(const llama_model & model, const llm_graph_params & params);
|
||||
|
||||
ggml_tensor * build_inp_embd_enc() const;
|
||||
};
|
||||
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
|
||||
struct llama_model_mistral4 : public llama_model_deepseek2 {
|
||||
llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {}
|
||||
|
||||
@@ -75,6 +75,8 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
||||
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
|
||||
|
||||
@@ -69,6 +69,8 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
|
||||
@@ -173,7 +173,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
|
||||
}
|
||||
|
||||
if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
|
||||
@@ -78,6 +78,8 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
|
||||
@@ -8849,7 +8849,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
for (int v : { 0, 1, 2, 3 }) {
|
||||
for (int dim : { 0, 1, 2, 3, }) {
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_F16, {11, 12, 13, 14}, 7, dim, v));
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_BF16, {11, 12, 13, 14}, 7, dim, v));
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_I8, {11, 12, 13, 14}, 7, dim, v));
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_I16, {11, 12, 13, 14}, 7, dim, v));
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_I64, {11, 12, 13, 14}, 7, dim, v));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -450,6 +450,9 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
|
||||
if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
|
||||
continue; // FIXME: ISWA KV cache initialization needs more fixture params
|
||||
}
|
||||
if (arch == LLM_ARCH_EAGLE3) {
|
||||
continue;
|
||||
}
|
||||
for (bool moe : {false, true}) {
|
||||
if (moe && !moe_implemented(arch)) {
|
||||
continue;
|
||||
@@ -553,6 +556,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
|
||||
if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
|
||||
continue; // FIXME: ISWA KV cache initialization needs more fixture params
|
||||
}
|
||||
if (arch == LLM_ARCH_EAGLE3) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const bool encode = arch == LLM_ARCH_T5 || arch == LLM_ARCH_DREAM || arch == LLM_ARCH_LLADA || arch == LLM_ARCH_LLADA_MOE || arch == LLM_ARCH_RND1;
|
||||
for (bool moe : {false, true}) {
|
||||
|
||||
@@ -54,6 +54,10 @@ struct clip_graph {
|
||||
virtual ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const;
|
||||
// TODO: build_mm(w, b, x) to support bias
|
||||
|
||||
virtual bool support_batch() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
//
|
||||
// utility functions
|
||||
//
|
||||
|
||||
+54
-23
@@ -171,6 +171,8 @@ struct clip_ctx {
|
||||
std::map<ggml_backend_dev_t, size_t> mem_usage;
|
||||
std::map<ggml_backend_dev_t, size_t> mem_compute;
|
||||
|
||||
bool support_batch = false;
|
||||
|
||||
clip_ctx(clip_context_params & ctx_params) {
|
||||
flash_attn_type = ctx_params.flash_attn_type;
|
||||
no_alloc = ctx_params.no_alloc;
|
||||
@@ -314,7 +316,7 @@ ggml_tensor * clip_graph::build_vit(
|
||||
std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos,
|
||||
const build_vit_opts & opts
|
||||
) {
|
||||
// batch dim: inp is [n_embd, n_pos] (B==1) or [n_embd, n_pos, B] (multi-tile encode)
|
||||
// batch dim: inp is [n_embd, n_pos, B]
|
||||
const int64_t B = inp->ne[2];
|
||||
|
||||
if (learned_pos_embd) {
|
||||
@@ -862,7 +864,7 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
|
||||
return cur;
|
||||
}
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
|
||||
static std::unique_ptr<clip_graph> clip_get_graph_builder(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
|
||||
const clip_image_f32 & img = *imgs.entries[0];
|
||||
std::unique_ptr<clip_graph> builder;
|
||||
|
||||
@@ -1025,7 +1027,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
// TODO [QWEN_VIDEO]: improve this in the future
|
||||
builder->n_batch = imgs.entries.size();
|
||||
|
||||
return builder->build();
|
||||
return builder;
|
||||
}
|
||||
|
||||
//
|
||||
@@ -2819,7 +2821,7 @@ struct clip_model_loader {
|
||||
std::vector<support_info_op> ops;
|
||||
};
|
||||
|
||||
static void warmup(clip_ctx & ctx_clip) {
|
||||
static clip_image_f32_batch get_dummy_batch(clip_ctx & ctx_clip) {
|
||||
// create a fake batch
|
||||
const auto & hparams = ctx_clip.model.hparams;
|
||||
clip_image_f32_batch batch;
|
||||
@@ -2833,6 +2835,20 @@ struct clip_model_loader {
|
||||
LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
|
||||
}
|
||||
batch.entries.push_back(std::move(img));
|
||||
return batch;
|
||||
}
|
||||
|
||||
static void init_ctx(clip_ctx & ctx_clip) {
|
||||
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||
|
||||
// check batching support
|
||||
auto batch = get_dummy_batch(ctx_clip);
|
||||
auto builder = clip_get_graph_builder(&ctx_clip, batch);
|
||||
ctx_clip.support_batch = builder->support_batch();
|
||||
}
|
||||
|
||||
static void warmup(clip_ctx & ctx_clip) {
|
||||
auto batch = get_dummy_batch(ctx_clip);
|
||||
warmup(ctx_clip, batch);
|
||||
}
|
||||
|
||||
@@ -2905,9 +2921,7 @@ struct clip_model_loader {
|
||||
|
||||
// only initialize backend buffers, but do not allocate them yet
|
||||
static support_info_graph reserve_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
|
||||
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||
|
||||
ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
|
||||
ggml_cgraph * gf = clip_get_graph_builder(&ctx_clip, batch)->build();
|
||||
ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
|
||||
|
||||
ctx_clip.mem_compute.clear();
|
||||
@@ -3070,6 +3084,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
||||
ctx_vision = new clip_ctx(ctx_params);
|
||||
loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
|
||||
loader.load_tensors(*ctx_vision);
|
||||
loader.init_ctx(*ctx_vision);
|
||||
if (ctx_params.warmup) {
|
||||
loader.warmup(*ctx_vision);
|
||||
}
|
||||
@@ -3083,6 +3098,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
||||
ctx_audio = new clip_ctx(ctx_params);
|
||||
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
|
||||
loader.load_tensors(*ctx_audio);
|
||||
loader.init_ctx(*ctx_audio);
|
||||
if (ctx_params.warmup) {
|
||||
loader.warmup(*ctx_audio);
|
||||
}
|
||||
@@ -3484,25 +3500,22 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
return n_patches;
|
||||
}
|
||||
|
||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, std::vector<float> & out_vec) {
|
||||
clip_image_f32_batch imgs;
|
||||
clip_image_f32_ptr img_copy(clip_image_f32_init());
|
||||
*img_copy = *img;
|
||||
imgs.entries.push_back(std::move(img_copy));
|
||||
|
||||
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
|
||||
return clip_image_batch_encode(ctx, n_threads, &imgs, out_vec);
|
||||
}
|
||||
|
||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
|
||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, std::vector<float> & out_batch_embd) {
|
||||
const clip_image_f32_batch & imgs = *imgs_c_ptr;
|
||||
int n_batch_cur = imgs.entries.size();
|
||||
|
||||
// maximum supported batch size, usually == 2 for qwen-vl-based models
|
||||
int n_batch_max = clip_model_n_batch_max(ctx);
|
||||
|
||||
// TODO @ngxson : implement batch size > 1 as a loop
|
||||
// we don't need true batching support because the cgraph will gonna be big anyway
|
||||
if (n_batch_cur > n_batch_max) {
|
||||
// [QWEN_VIDEO] for video models, the batch dimension is used as temporal dimension for merged frames
|
||||
if (!ctx->support_batch && n_batch_cur > clip_model_n_temporal_merge(ctx)) {
|
||||
LOG_ERR("%s: batch size %d exceeds maximum supported batch/temporal-merge size %d\n", __func__, n_batch_cur, clip_model_n_temporal_merge(ctx));
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -3513,7 +3526,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
|
||||
// build the inference graph
|
||||
ggml_backend_sched_reset(ctx->sched.get());
|
||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
||||
ggml_cgraph * gf = clip_get_graph_builder(ctx, imgs)->build();
|
||||
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
|
||||
|
||||
// set inputs
|
||||
@@ -3582,6 +3595,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
const int n = nx * ny;
|
||||
|
||||
for (int b = 0; b < n_batch_cur; b++) {
|
||||
LOG_DBG("%s: copying image %d/%d to input buffer (nx=%d, ny=%d)\n", __func__, b+1, n_batch_cur, nx, ny);
|
||||
const auto & buf = imgs.entries[b]->get_ro_buf();
|
||||
float * batch_entry = inp_raw.data() + b * (3*n);
|
||||
for (int y = 0; y < ny; y++) {
|
||||
@@ -4416,7 +4430,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
// the last node is the embedding tensor
|
||||
ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
||||
|
||||
// sanity check (only support batch size of 1 for now)
|
||||
// sanity check (assuming that all images in batch have the same number of tokens, so we only check the first one)
|
||||
const int n_tokens_out = embeddings->ne[1];
|
||||
const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
|
||||
if (n_tokens_out != expected_n_tokens_out) {
|
||||
@@ -4424,16 +4438,26 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
GGML_ABORT("Invalid number of output tokens");
|
||||
}
|
||||
|
||||
// copy the embeddings to the location passed by the user
|
||||
if (vec != nullptr) {
|
||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||
LOG_DBG("%s: output embedding shape [%d, %d, %d]\n", __func__,
|
||||
(int)embeddings->ne[0], (int)embeddings->ne[1], (int)embeddings->ne[2]);
|
||||
|
||||
// copy output to user buffer if provided
|
||||
// if output is empty, skip the copy
|
||||
if (!out_batch_embd.empty()) {
|
||||
if (out_batch_embd.size() != (size_t)ggml_nelements(embeddings)) {
|
||||
LOG_ERR("%s: output buffer has %zu elements but expected %zu\n", __func__, out_batch_embd.size(), (size_t)ggml_nelements(embeddings));
|
||||
GGML_ABORT("Output buffer size mismatch");
|
||||
}
|
||||
ggml_backend_tensor_get(embeddings, out_batch_embd.data(), 0, ggml_nbytes(embeddings));
|
||||
} else {
|
||||
LOG_WRN("%s: output buffer is empty, skipping copy\n", __func__);
|
||||
}
|
||||
|
||||
// Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
|
||||
if (ctx->debug_output_embeddings) {
|
||||
const int64_t n_embd = embeddings->ne[0];
|
||||
const int64_t n_tokens = embeddings->ne[1];
|
||||
std::vector<float> emb_data(n_embd * n_tokens);
|
||||
std::vector<float> emb_data(ggml_nelements(embeddings));
|
||||
ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings));
|
||||
|
||||
LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n");
|
||||
@@ -4570,7 +4594,14 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
|
||||
return ctx->model.modality == CLIP_MODALITY_AUDIO;
|
||||
}
|
||||
|
||||
int clip_model_n_batch_max(const struct clip_ctx * ctx) {
|
||||
bool clip_support_batch(const struct clip_ctx * ctx) {
|
||||
return ctx->support_batch;
|
||||
}
|
||||
|
||||
// TODO @ngxson : this is no longer correct with mtmd_batch API
|
||||
// this was only meant to be used by qwen-vl-based models, to fuse 2 input images into one (qwen-vl video support)
|
||||
// this logic should be refactored in near future to distinctly handle "merge frames" and "batching"
|
||||
int clip_model_n_temporal_merge(const struct clip_ctx * ctx) {
|
||||
switch (ctx->proj_type()) {
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
|
||||
+5
-3
@@ -97,8 +97,8 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
|
||||
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
|
||||
struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
|
||||
|
||||
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
|
||||
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, std::vector<float> & out_vec);
|
||||
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector<float> & out_batch_embd);
|
||||
|
||||
bool clip_is_llava(const struct clip_ctx * ctx);
|
||||
// note for contributor: this clip_is_(model) pattern is deprecated
|
||||
@@ -107,7 +107,9 @@ bool clip_is_llava(const struct clip_ctx * ctx);
|
||||
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
|
||||
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
|
||||
|
||||
int clip_model_n_batch_max(const struct clip_ctx * ctx);
|
||||
bool clip_support_batch(const struct clip_ctx * ctx);
|
||||
|
||||
int clip_model_n_temporal_merge(const struct clip_ctx * ctx); // TODO @ngxson : remove, refactor this
|
||||
|
||||
std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ ggml_cgraph * clip_graph_gemma4v::build() {
|
||||
ggml_set_name(inp_raw, "inp_raw_scaled");
|
||||
|
||||
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
|
||||
inp = ggml_reshape_3d(ctx0, inp, n_patches, n_embd, n_batch);
|
||||
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
|
||||
ggml_set_name(inp, "inp");
|
||||
// note: no patch bias
|
||||
@@ -51,10 +51,11 @@ ggml_cgraph * clip_graph_gemma4v::build() {
|
||||
// first half
|
||||
ggml_tensor * first;
|
||||
{
|
||||
first = ggml_view_3d(ctx0, cur,
|
||||
n_dim/2, n_head, n_pos,
|
||||
first = ggml_view_4d(ctx0, cur,
|
||||
n_dim/2, n_head, n_pos, n_batch,
|
||||
cur->nb[1],
|
||||
cur->nb[2],
|
||||
cur->nb[3],
|
||||
0);
|
||||
first = ggml_rope_ext(
|
||||
ctx0,
|
||||
@@ -70,10 +71,11 @@ ggml_cgraph * clip_graph_gemma4v::build() {
|
||||
// second half
|
||||
ggml_tensor * second;
|
||||
{
|
||||
second = ggml_view_3d(ctx0, cur,
|
||||
n_dim/2, n_head, n_pos,
|
||||
second = ggml_view_4d(ctx0, cur,
|
||||
n_dim/2, n_head, n_pos, n_batch,
|
||||
cur->nb[1],
|
||||
cur->nb[2],
|
||||
cur->nb[3],
|
||||
n_dim/2 * ggml_element_size(cur));
|
||||
second = ggml_rope_ext(
|
||||
ctx0,
|
||||
@@ -103,14 +105,14 @@ ggml_cgraph * clip_graph_gemma4v::build() {
|
||||
const int kernel_size = hparams.n_merge;
|
||||
GGML_ASSERT(kernel_size > 0);
|
||||
|
||||
// [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1]
|
||||
cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1);
|
||||
// [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, n_batch]
|
||||
cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, n_batch);
|
||||
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG,
|
||||
kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
|
||||
const int out_x = n_patches_x / kernel_size;
|
||||
const int out_y = n_patches_y / kernel_size;
|
||||
// [out_x, out_y, n_embd, 1] -> [n_embd, out_x * out_y]
|
||||
cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, 1);
|
||||
// [out_x, out_y, n_embd, n_batch] -> [n_embd, out_x * out_y, n_batch]
|
||||
cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, n_batch);
|
||||
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
||||
cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd));
|
||||
cb(cur, "pooled", -1);
|
||||
|
||||
@@ -16,6 +16,7 @@ struct clip_graph_gemma4v : clip_graph {
|
||||
clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
|
||||
bool support_batch() const override { return true; }
|
||||
};
|
||||
|
||||
struct clip_graph_gemma4uv : clip_graph {
|
||||
|
||||
@@ -67,8 +67,8 @@ MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image,
|
||||
|
||||
// helper function that automatically:
|
||||
// 1. run llama_decode() on text chunks
|
||||
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
||||
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
|
||||
// 2. run mtmd_encode_chunk() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
||||
// if any of the mtmd_encode_chunk() or llama_decode() calls return non-zero, stop and forward the error
|
||||
// otherwise, returns 0 on success
|
||||
// this function is NOT thread-safe
|
||||
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
||||
@@ -157,13 +157,16 @@ MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <set>
|
||||
#include <memory>
|
||||
|
||||
namespace mtmd_helper {
|
||||
|
||||
//
|
||||
// C++ wrappers
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace mtmd_helper {
|
||||
|
||||
// video-related C++ wrappers
|
||||
struct mtmd_helper_video_deleter {
|
||||
void operator()(mtmd_helper_video * val) { mtmd_helper_video_free(val); }
|
||||
|
||||
+296
-66
@@ -69,8 +69,8 @@ struct mtmd_bitmap {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
bool can_batch_with(const mtmd_bitmap & other) const {
|
||||
// [QWEN_VIDEO] can batch if both are images with same size
|
||||
bool can_merge_with(const mtmd_bitmap & other) const {
|
||||
// [QWEN_VIDEO] can (temporal) merge if both are images with same size
|
||||
return !is_audio && !other.is_audio && nx == other.nx && ny == other.ny;
|
||||
}
|
||||
|
||||
@@ -90,12 +90,24 @@ struct mtmd_image_tokens {
|
||||
uint32_t ny = 0; // number of tokens in y direction
|
||||
mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
|
||||
uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL)
|
||||
uint32_t n_temporal_merge = 1; // for qwen-vl style temporal merge
|
||||
uint32_t n_tokens() const {
|
||||
if (pos == MTMD_POS_TYPE_HUNYUANVL) {
|
||||
// [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
|
||||
return (nx + 1) * ny + 2;
|
||||
}
|
||||
return nx * ny;
|
||||
// [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
|
||||
if (batch_f32.entries.size() == 1 || n_temporal_merge == 1) {
|
||||
return nx * ny;
|
||||
}
|
||||
uint32_t nz = batch_f32.entries.size();
|
||||
// TODO: simplify this by repeating the last frame until it fits the temporal merge
|
||||
if (nz % n_temporal_merge != 0) {
|
||||
nz = nz / n_temporal_merge + 1;
|
||||
} else {
|
||||
nz = nz / n_temporal_merge;
|
||||
}
|
||||
return nx * ny * nz;
|
||||
}
|
||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
||||
std::string id; // optional user-defined ID, useful for KV cache tracking
|
||||
@@ -110,12 +122,17 @@ struct mtmd_image_tokens {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool can_batch_with(const mtmd_image_tokens & other) {
|
||||
return nx == other.nx && ny == other.ny && pos == other.pos;
|
||||
}
|
||||
|
||||
mtmd_image_tokens clone() {
|
||||
return mtmd_image_tokens{
|
||||
nx,
|
||||
ny,
|
||||
pos,
|
||||
image_idx,
|
||||
n_temporal_merge,
|
||||
batch_f32.clone(),
|
||||
id
|
||||
};
|
||||
@@ -153,12 +170,49 @@ struct mtmd_input_chunk {
|
||||
std::vector<llama_token> tokens_text;
|
||||
mtmd_image_tokens_ptr tokens_image;
|
||||
mtmd_audio_tokens_ptr tokens_audio;
|
||||
|
||||
bool can_batch_with(const mtmd_input_chunk & other) const {
|
||||
if (type != other.type) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (tokens_image && other.tokens_image) {
|
||||
return tokens_image->can_batch_with(*other.tokens_image);
|
||||
}
|
||||
|
||||
// TODO: allow batching audio chunks of the same size
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_placeholder() const {
|
||||
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
return tokens_image && tokens_image->is_placeholder();
|
||||
} else if (type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
return tokens_audio && tokens_audio->is_placeholder();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
struct mtmd_input_chunks {
|
||||
std::vector<mtmd_input_chunk> entries;
|
||||
};
|
||||
|
||||
struct mtmd_batch {
|
||||
mtmd_context * ctx;
|
||||
std::vector<const mtmd_input_chunk *> entries;
|
||||
std::vector<float> output_embd; // aggregated output embedding for the whole batch
|
||||
mtmd_batch(mtmd_context * ctx): ctx(ctx) {}
|
||||
int32_t n_tokens() const {
|
||||
int32_t n = 0;
|
||||
for (const auto * chunk : entries) {
|
||||
n += mtmd_input_chunk_get_n_tokens(chunk);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
};
|
||||
|
||||
// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
|
||||
// models not having it (llava-1.6) will process embeddings without any special tokens in-between
|
||||
enum mtmd_slice_tmpl {
|
||||
@@ -197,6 +251,7 @@ mtmd_context_params mtmd_context_params_default() {
|
||||
/* image_max_tokens */ -1,
|
||||
/* cb_eval */ nullptr,
|
||||
/* cb_eval_user_data */ nullptr,
|
||||
/* batch_max_tokens */ 1024,
|
||||
};
|
||||
return params;
|
||||
}
|
||||
@@ -204,7 +259,7 @@ mtmd_context_params mtmd_context_params_default() {
|
||||
struct mtmd_context {
|
||||
struct clip_ctx * ctx_v; // vision
|
||||
struct clip_ctx * ctx_a; // audio
|
||||
std::vector<float> image_embd_v; // image embedding vector
|
||||
std::vector<float> out_embd; // image embedding vector
|
||||
|
||||
bool print_timings;
|
||||
int n_threads;
|
||||
@@ -239,17 +294,21 @@ struct mtmd_context {
|
||||
std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
|
||||
std::unique_ptr<mtmd_image_preprocessor> image_preproc;
|
||||
|
||||
// batching
|
||||
int32_t batch_max_tokens;
|
||||
|
||||
// TODO @ngxson : add timings
|
||||
|
||||
mtmd_context(const char * mmproj_fname,
|
||||
const llama_model * text_model,
|
||||
const mtmd_context_params & ctx_params,
|
||||
bool no_alloc = false) :
|
||||
print_timings(ctx_params.print_timings),
|
||||
n_threads (ctx_params.n_threads),
|
||||
media_marker (ctx_params.media_marker),
|
||||
n_embd_text (text_model ? llama_model_n_embd_inp(text_model) : -1),
|
||||
vocab (text_model ? llama_model_get_vocab(text_model) : nullptr)
|
||||
print_timings (ctx_params.print_timings),
|
||||
n_threads (ctx_params.n_threads),
|
||||
media_marker (ctx_params.media_marker),
|
||||
n_embd_text (text_model ? llama_model_n_embd_inp(text_model) : -1),
|
||||
vocab (text_model ? llama_model_get_vocab(text_model) : nullptr),
|
||||
batch_max_tokens(ctx_params.batch_max_tokens)
|
||||
{
|
||||
if (ctx_params.image_marker != nullptr) {
|
||||
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
|
||||
@@ -680,6 +739,16 @@ struct mtmd_context {
|
||||
return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
int64_t n_embd_out() const {
|
||||
if (ctx_v) {
|
||||
return clip_n_mmproj_embd(ctx_v);
|
||||
} else if (ctx_a) {
|
||||
return clip_n_mmproj_embd(ctx_a);
|
||||
} else {
|
||||
throw std::runtime_error("no CLIP model loaded");
|
||||
}
|
||||
}
|
||||
|
||||
~mtmd_context() {
|
||||
clip_free(ctx_a);
|
||||
clip_free(ctx_v);
|
||||
@@ -845,7 +914,7 @@ struct mtmd_tokenizer {
|
||||
// [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl)
|
||||
int n_merge_frames = 1;
|
||||
if (ctx->ctx_v) {
|
||||
n_merge_frames = clip_model_n_batch_max(ctx->ctx_v);
|
||||
n_merge_frames = clip_model_n_temporal_merge(ctx->ctx_v);
|
||||
GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more");
|
||||
}
|
||||
|
||||
@@ -860,7 +929,7 @@ struct mtmd_tokenizer {
|
||||
if (i + 1 < parts.size() && parts[i + 1].bitmap != nullptr) {
|
||||
const mtmd_bitmap * bm_a = parts[i].bitmap;
|
||||
const mtmd_bitmap * bm_b = parts[i + 1].bitmap;
|
||||
if (bm_a->can_batch_with(*bm_b)) {
|
||||
if (bm_a->can_merge_with(*bm_b)) {
|
||||
LOG_DBG("%s: merging 2 frames at part index %zu and %zu\n", __func__, i, i + 1);
|
||||
merged_bitmaps.push_back({bm_a, bm_b});
|
||||
parts.erase(parts.begin() + i + 1); // collapse the second bitmap part
|
||||
@@ -1103,13 +1172,17 @@ struct mtmd_tokenizer {
|
||||
size_t n_tokens = 0;
|
||||
for (const auto & e : batch_f32.entries) {
|
||||
n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
|
||||
if (clip_model_n_batch_max(ctx->ctx_v) == 2) {
|
||||
if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
|
||||
// [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
|
||||
// [QWEN_VIDEO] improve this in the future
|
||||
image_tokens->n_temporal_merge = clip_model_n_temporal_merge(ctx->ctx_v);
|
||||
|
||||
if (mtmd_decode_use_mrope(ctx)) {
|
||||
// for Qwen2VL, we need this information for M-RoPE decoding positions
|
||||
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
|
||||
@@ -1327,60 +1400,18 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
}
|
||||
}
|
||||
|
||||
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
||||
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
|
||||
return 0;
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
if (!ctx->ctx_v) {
|
||||
LOG_ERR("%s: model does not support vision input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (chunk->tokens_image == nullptr) {
|
||||
LOG_ERR("%s: image tokens are null\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (chunk->tokens_image->is_placeholder()) {
|
||||
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
return mtmd_encode(ctx, chunk->tokens_image.get());
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
if (!ctx->ctx_a) {
|
||||
LOG_ERR("%s: model does not support audio input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (chunk->tokens_audio == nullptr) {
|
||||
LOG_ERR("%s: audio tokens are null\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (chunk->tokens_audio->is_placeholder()) {
|
||||
LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
int n_mmproj_embd = ctx->n_embd_text;
|
||||
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
|
||||
bool ok = clip_image_batch_encode(
|
||||
ctx->ctx_a,
|
||||
ctx->n_threads,
|
||||
&chunk->tokens_audio->batch_f32,
|
||||
ctx->image_embd_v.data());
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
|
||||
static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> & out_embd) {
|
||||
clip_ctx * ctx_clip = ctx->ctx_v;
|
||||
if (!ctx_clip) {
|
||||
LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
auto proj_type = clip_get_projector_type(ctx_clip);
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
|
||||
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
||||
|
||||
int n_embd_out = ctx->n_embd_out();
|
||||
auto n_tokens_out = image_tokens->n_tokens();
|
||||
out_embd.resize((size_t)n_embd_out * n_tokens_out);
|
||||
|
||||
bool ok = false;
|
||||
|
||||
if (clip_is_llava(ctx_clip)
|
||||
@@ -1400,12 +1431,19 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|
||||
return 1;
|
||||
}
|
||||
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
|
||||
ok = clip_image_encode(
|
||||
std::vector<float> tmp_embd((size_t)n_tokens_per_image * n_embd_out);
|
||||
bool ok_i = clip_image_encode(
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
entries[i].get(),
|
||||
ctx->image_embd_v.data() + offset);
|
||||
offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
|
||||
tmp_embd);
|
||||
if (!ok_i) {
|
||||
LOG_ERR("%s: failed to encode image %zu\n", __func__, i);
|
||||
return 1;
|
||||
}
|
||||
ok = true;
|
||||
std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
|
||||
offset += static_cast<size_t>(n_embd_out) * n_tokens_per_image;
|
||||
}
|
||||
} else {
|
||||
if (image_tokens->is_placeholder()) {
|
||||
@@ -1416,14 +1454,206 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
&image_tokens->batch_f32,
|
||||
ctx->image_embd_v.data());
|
||||
out_embd);
|
||||
}
|
||||
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk * chunk, std::vector<float> & out_embd) {
|
||||
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
|
||||
return 0;
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
if (!ctx->ctx_v) {
|
||||
LOG_ERR("%s: model does not support vision input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (chunk->tokens_image == nullptr) {
|
||||
LOG_ERR("%s: image tokens are null\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (chunk->tokens_image->is_placeholder()) {
|
||||
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
return mtmd_encode_impl(ctx, chunk->tokens_image.get(), out_embd);
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
if (!ctx->ctx_a) {
|
||||
LOG_ERR("%s: model does not support audio input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (chunk->tokens_audio == nullptr) {
|
||||
LOG_ERR("%s: audio tokens are null\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (chunk->tokens_audio->is_placeholder()) {
|
||||
LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
int n_mmproj_embd = ctx->n_embd_out();
|
||||
out_embd.resize((size_t)chunk->tokens_audio->n_tokens * n_mmproj_embd);
|
||||
bool ok = clip_image_batch_encode(
|
||||
ctx->ctx_a,
|
||||
ctx->n_threads,
|
||||
&chunk->tokens_audio->batch_f32,
|
||||
out_embd);
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
||||
// this is the non-batching version
|
||||
try {
|
||||
return mtmd_encode_chunk_impl(ctx, chunk, ctx->out_embd);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: error: %s\n", __func__, e.what());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
|
||||
try {
|
||||
return mtmd_encode_impl(ctx, image_tokens, ctx->out_embd);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: error: %s\n", __func__, e.what());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
float * mtmd_get_output_embd(mtmd_context * ctx) {
|
||||
return ctx->image_embd_v.data();
|
||||
return ctx->out_embd.data();
|
||||
}
|
||||
|
||||
mtmd_batch * mtmd_batch_init(mtmd_context * ctx) {
|
||||
return new mtmd_batch(ctx);
|
||||
}
|
||||
|
||||
void mtmd_batch_free(mtmd_batch * batch) {
|
||||
if (batch) {
|
||||
delete batch;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk) {
|
||||
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
LOG_ERR("%s: text chunk is not supported in batch\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto * ctx = batch->ctx->get_clip_ctx(chunk);
|
||||
if (!ctx) {
|
||||
LOG_ERR("%s: model does not support input chunk type %d\n", __func__, (int)chunk->type);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (batch->entries.empty()) {
|
||||
// batch must have at least one chunk
|
||||
batch->entries.push_back(chunk);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!clip_support_batch(ctx)) {
|
||||
// if no batching support, batch can only have one single chunk
|
||||
return 2; // "batch too large" error code
|
||||
}
|
||||
|
||||
int32_t new_n_tokens = batch->n_tokens() + (int32_t)mtmd_input_chunk_get_n_tokens(chunk);
|
||||
if (new_n_tokens > batch->ctx->batch_max_tokens) {
|
||||
return 2; // "batch too large" error code
|
||||
}
|
||||
|
||||
auto & first_chunk = batch->entries[0];
|
||||
if (first_chunk->can_batch_with(*chunk)) {
|
||||
batch->entries.push_back(chunk);
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 3; // "cannot batch" error code
|
||||
}
|
||||
|
||||
static int32_t mtmd_batch_encode_impl(mtmd_batch * batch) {
|
||||
if (batch->entries.empty()) {
|
||||
LOG_ERR("%s: batch is empty\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
for (const auto * chunk : batch->entries) {
|
||||
if (chunk->is_placeholder()) {
|
||||
LOG_ERR("%s: chunk is placeholder\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// represent the whole batch as one single chunk
|
||||
mtmd::input_chunk_ptr batch_chunk(mtmd_input_chunk_copy(batch->entries[0]));
|
||||
if (batch_chunk->tokens_image) {
|
||||
auto & b0_f32 = batch_chunk->tokens_image->batch_f32;
|
||||
// copy all entries from other chunks into the first chunk's batch_f32
|
||||
// note: skip first entry because it's already in batch_chunk
|
||||
for (size_t ic = 1; ic < batch->entries.size(); ic++) {
|
||||
auto & chunk = batch->entries[ic];
|
||||
GGML_ASSERT(chunk->tokens_image);
|
||||
auto b1_f32 = chunk->tokens_image->batch_f32.clone();
|
||||
for (size_t i = 0; i < b1_f32.entries.size(); i++) {
|
||||
b0_f32.entries.push_back(std::move(b1_f32.entries[i]));
|
||||
}
|
||||
}
|
||||
} else if (batch_chunk->tokens_audio) {
|
||||
auto & b0_f32 = batch_chunk->tokens_audio->batch_f32;
|
||||
// copy all entries from other chunks into the first chunk's batch_f32
|
||||
// note: skip first entry because it's already in batch_chunk
|
||||
for (size_t ic = 1; ic < batch->entries.size(); ic++) {
|
||||
auto & chunk = batch->entries[ic];
|
||||
GGML_ASSERT(chunk->tokens_audio);
|
||||
auto b1_f32 = chunk->tokens_audio->batch_f32.clone();
|
||||
for (size_t i = 0; i < b1_f32.entries.size(); i++) {
|
||||
b0_f32.entries.push_back(std::move(b1_f32.entries[i]));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG_ERR("%s: unsupported chunk type\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG_DBG("%s: encoding batch with %zu entries and total %zu tokens\n",
|
||||
__func__, batch->entries.size(), mtmd_input_chunk_get_n_tokens(batch_chunk.get()));
|
||||
int32_t res = mtmd_encode_chunk_impl(
|
||||
batch->ctx,
|
||||
batch_chunk.get(),
|
||||
batch->output_embd);
|
||||
return res;
|
||||
}
|
||||
|
||||
int32_t mtmd_batch_encode(mtmd_batch * batch) {
|
||||
try {
|
||||
return mtmd_batch_encode_impl(batch);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: error: %s\n", __func__, e.what());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk) {
|
||||
if (batch->output_embd.empty()) {
|
||||
LOG_ERR("%s: batch has not been encoded yet\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
size_t offset = 0;
|
||||
const size_t n_embd = batch->ctx->n_embd_out();
|
||||
for (const auto * c : batch->entries) {
|
||||
size_t offset_prev = offset;
|
||||
size_t n_tokens = mtmd_input_chunk_get_n_tokens(c);
|
||||
offset += n_tokens * n_embd;
|
||||
GGML_ASSERT(offset_prev < batch->output_embd.size());
|
||||
GGML_ASSERT(offset <= batch->output_embd.size());
|
||||
if (c == chunk) {
|
||||
return &batch->output_embd.data()[offset_prev];
|
||||
}
|
||||
}
|
||||
return nullptr; // not found
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
||||
@@ -1801,7 +2031,7 @@ static void mtmd_debug_encode_impl(mtmd_context * ctx, clip_ctx * ctx_clip, clip
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
&image,
|
||||
embd_output.data());
|
||||
embd_output);
|
||||
if (!ok) {
|
||||
LOG_ERR("%s: failed to encode image\n", __func__);
|
||||
}
|
||||
|
||||
+36
-4
@@ -63,6 +63,7 @@ struct mtmd_bitmap;
|
||||
struct mtmd_image_tokens;
|
||||
struct mtmd_input_chunk;
|
||||
struct mtmd_input_chunks;
|
||||
struct mtmd_batch;
|
||||
|
||||
struct mtmd_input_text {
|
||||
const char * text;
|
||||
@@ -80,6 +81,7 @@ typedef struct mtmd_image_tokens mtmd_image_tokens;
|
||||
typedef struct mtmd_input_chunk mtmd_input_chunk;
|
||||
typedef struct mtmd_input_chunks mtmd_input_chunks;
|
||||
typedef struct mtmd_input_text mtmd_input_text;
|
||||
typedef struct mtmd_batch mtmd_batch;
|
||||
|
||||
struct mtmd_context_params {
|
||||
bool use_gpu;
|
||||
@@ -97,6 +99,11 @@ struct mtmd_context_params {
|
||||
// callback function passed over to mtmd proper
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
|
||||
// batching params
|
||||
int32_t batch_max_tokens; // maximum number of output tokens in a batch
|
||||
// (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit)
|
||||
// (default: 1024)
|
||||
};
|
||||
|
||||
MTMD_API const char * mtmd_default_marker(void);
|
||||
@@ -265,12 +272,12 @@ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
const mtmd_bitmap ** bitmaps,
|
||||
size_t n_bitmaps);
|
||||
|
||||
// returns 0 on success
|
||||
// TODO: deprecate
|
||||
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
|
||||
const mtmd_image_tokens * image_tokens);
|
||||
DEPRECATED(MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens),
|
||||
"use mtmd_encode_chunk() instead");
|
||||
|
||||
// text chunk will be ignored silently, only media chunk will be encoded
|
||||
// returns 0 on success
|
||||
// returns 1 on generic error
|
||||
MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
|
||||
const mtmd_input_chunk * chunk);
|
||||
|
||||
@@ -279,6 +286,26 @@ MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
|
||||
// llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
|
||||
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
|
||||
|
||||
|
||||
// batch encoding API
|
||||
// chunks are not owned by the batch, they will not be freed by mtmd_batch_free()
|
||||
// batch is valid for a given context, cannot be shared across contexts
|
||||
MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx);
|
||||
MTMD_API void mtmd_batch_free(mtmd_batch * batch);
|
||||
|
||||
// only media chunks are allowed, text chunks will be rejected
|
||||
// returns 0 on success
|
||||
// returns 1 on generic error
|
||||
// returns 2 if the batch is too large (chunk won't be added)
|
||||
// returns 3 if it cannot be batched with the existing chunks in the batch
|
||||
MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk);
|
||||
|
||||
// returns 0 on success
|
||||
// returns 1 on generic error
|
||||
MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch);
|
||||
MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk);
|
||||
|
||||
|
||||
// Set callback for all future logging events.
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
|
||||
@@ -336,6 +363,11 @@ struct mtmd_input_chunk_deleter {
|
||||
};
|
||||
using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
|
||||
|
||||
struct mtmd_batch_deleter {
|
||||
void operator()(mtmd_batch * val) { mtmd_batch_free(val); }
|
||||
};
|
||||
using batch_ptr = std::unique_ptr<mtmd_batch, mtmd_batch_deleter>;
|
||||
|
||||
struct bitmap {
|
||||
bitmap_ptr ptr;
|
||||
bitmap() : ptr(nullptr) {}
|
||||
|
||||
@@ -344,6 +344,14 @@ const mtmd::input_chunk_ptr & server_tokens::find_chunk(size_t idx) const {
|
||||
throw std::runtime_error("Chunk not found");
|
||||
}
|
||||
|
||||
std::pair<const mtmd::input_chunk_ptr *, size_t> server_tokens::find_next_media_chunk(size_t idx) const {
|
||||
auto it = map_idx_to_media.upper_bound(idx);
|
||||
if (it != map_idx_to_media.end()) {
|
||||
return { &it->second, it->first };
|
||||
}
|
||||
return { nullptr, 0 };
|
||||
}
|
||||
|
||||
void server_tokens::push_back(llama_token tok) {
|
||||
if (tok == LLAMA_TOKEN_NULL) {
|
||||
throw std::runtime_error("Invalid token");
|
||||
@@ -1126,9 +1134,9 @@ json oaicompat_chat_params_parse(
|
||||
|
||||
// Reasoning budget: pass parameters through to sampling layer
|
||||
{
|
||||
int reasoning_budget = opt.reasoning_budget;
|
||||
if (reasoning_budget == -1 && body.contains("thinking_budget_tokens")) {
|
||||
reasoning_budget = json_value(body, "thinking_budget_tokens", -1);
|
||||
int reasoning_budget = json_value(body, "thinking_budget_tokens", -1);
|
||||
if (reasoning_budget == -1) {
|
||||
reasoning_budget = opt.reasoning_budget;
|
||||
}
|
||||
|
||||
if (!chat_params.thinking_end_tag.empty()) {
|
||||
|
||||
@@ -180,6 +180,10 @@ public:
|
||||
|
||||
const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
|
||||
|
||||
// find next media chunk after idx
|
||||
// returns a pair of pointer to the chunk (nullptr if not found) and its start index in tokens
|
||||
std::pair<const mtmd::input_chunk_ptr *, size_t> find_next_media_chunk(size_t idx) const;
|
||||
|
||||
void push_back(llama_token tok);
|
||||
|
||||
// will create a copy of the chunk if it contains non-text data
|
||||
|
||||
+116
-19
@@ -80,6 +80,8 @@ struct server_slot {
|
||||
|
||||
// multimodal
|
||||
mtmd_context * mctx = nullptr;
|
||||
mtmd::batch_ptr mbatch = nullptr;
|
||||
std::array<llama_context *, 2> mtgt = {nullptr, nullptr}; // [0] for main context, [1] for optional draft context
|
||||
|
||||
// speculative decoding
|
||||
common_speculative * spec;
|
||||
@@ -239,6 +241,18 @@ struct server_slot {
|
||||
|
||||
// clear alora start
|
||||
alora_invocation_start = -1;
|
||||
|
||||
// clear multimodal state
|
||||
mbatch.reset();
|
||||
mtgt[0] = ctx_tgt;
|
||||
mtgt[1] = nullptr;
|
||||
if (ctx_dft && llama_get_ctx_other(ctx_dft) != ctx_tgt) {
|
||||
// TODO: in the future, figure out how to infuse target embeddings to the images
|
||||
// for now, we re-decode the same chunk in both ctx_tgt and ctx_dft
|
||||
// maybe we simply need to call `common_speculative_process()` ?
|
||||
// [TAG_MTMD_DRAFT_PROCESSING]
|
||||
mtgt[1] = ctx_dft;
|
||||
}
|
||||
}
|
||||
|
||||
void init_sampler() const {
|
||||
@@ -578,6 +592,87 @@ struct server_slot {
|
||||
other.prompt = prompt.clone();
|
||||
other.init_sampler();
|
||||
}
|
||||
|
||||
// returns 0 on success
|
||||
// caller need to update prompt.tokens after a successful call to keep track of the processing progress
|
||||
int process_mtmd_chunk(size_t idx, size_t & n_tokens_out) {
|
||||
GGML_ASSERT(mctx);
|
||||
const auto & input_tokens = task->tokens;
|
||||
auto & chunk = input_tokens.find_chunk(idx);
|
||||
int32_t res = 0;
|
||||
|
||||
auto try_decode = [&]() -> int32_t {
|
||||
if (mbatch) {
|
||||
float * embd = mtmd_batch_get_output_embd(mbatch.get(), chunk.get());
|
||||
if (embd) {
|
||||
for (auto * lctx : mtgt) {
|
||||
if (lctx == nullptr) {
|
||||
continue;
|
||||
}
|
||||
llama_pos new_n_past; // unused for now
|
||||
res = mtmd_helper_decode_image_chunk(
|
||||
mctx,
|
||||
lctx,
|
||||
chunk.get(),
|
||||
embd,
|
||||
prompt.tokens.pos_next(),
|
||||
id,
|
||||
llama_n_batch(lctx),
|
||||
&new_n_past
|
||||
);
|
||||
if (res != 0) {
|
||||
SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
|
||||
return 0; // success
|
||||
}
|
||||
}
|
||||
return 1; // (non-error) need to create & encode batch
|
||||
};
|
||||
|
||||
// if the batch is already exist, try searching & encode
|
||||
res = try_decode();
|
||||
if (res == 0) {
|
||||
return 0;
|
||||
} else if (res < 0) {
|
||||
// fatal error
|
||||
return res;
|
||||
}
|
||||
|
||||
// otherwise, the batch is either uninitialized or is used up
|
||||
// we need to create & encode a new batch
|
||||
mbatch.reset(mtmd_batch_init(mctx));
|
||||
res = mtmd_batch_add_chunk(mbatch.get(), chunk.get());
|
||||
GGML_ASSERT(res == 0); // we should never have an empty batch
|
||||
|
||||
// try batching as much as possible
|
||||
int n_added = 1;
|
||||
size_t idx_cur = idx;
|
||||
while (res == 0) {
|
||||
auto [next_chunk, next_idx] = input_tokens.find_next_media_chunk(idx_cur);
|
||||
if (next_chunk == nullptr) {
|
||||
break;
|
||||
}
|
||||
res = mtmd_batch_add_chunk(mbatch.get(), next_chunk->get());
|
||||
n_added += (res == 0 ? 1 : 0);
|
||||
idx_cur = next_idx;
|
||||
SLT_DBG(*this, "try adding media chunk idx = %zu to batch, res = %d\n", next_idx, res);
|
||||
// if res != 0, batch is full or chunk is not compatible -> this loop breaks
|
||||
}
|
||||
|
||||
// TODO @ngxson : move this log line to debug when it become more stable
|
||||
SLT_INF(*this, "encoding mtmd batch from idx = %zu, n_chunks = %d\n", idx, n_added);
|
||||
|
||||
res = mtmd_batch_encode(mbatch.get());
|
||||
if (res != 0) {
|
||||
SLT_ERR(*this, "failed to encode mtmd batch for chunk idx = %zu, res = %d\n", idx, res);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return try_decode();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -781,6 +876,7 @@ private:
|
||||
mparams.warmup = params_base.warmup;
|
||||
mparams.image_min_tokens = params_base.image_min_tokens;
|
||||
mparams.image_max_tokens = params_base.image_max_tokens;
|
||||
mparams.batch_max_tokens = params_base.mtmd_batch_max_tokens;
|
||||
mparams.media_marker = get_media_marker();
|
||||
}
|
||||
|
||||
@@ -866,10 +962,7 @@ private:
|
||||
}
|
||||
|
||||
for (size_t j = 0; j < devs.size(); ++j) {
|
||||
const size_t bytes =
|
||||
(measure_model_bytes ? dmd[j].mb.model : 0) +
|
||||
dmd[j].mb.context +
|
||||
dmd[j].mb.compute;
|
||||
const size_t bytes = (measure_model_bytes ? dmd[j].model : 0) + dmd[j].context + dmd[j].compute;
|
||||
total += bytes;
|
||||
for (size_t i = 0; i < tgt_devices.size(); i++) {
|
||||
if (tgt_devices[i] == devs[j]) {
|
||||
@@ -2046,6 +2139,9 @@ private:
|
||||
|
||||
auto & cur = slot.prompt.checkpoints.emplace_back();
|
||||
|
||||
// [TAG_CHECKPOINTS_FIX_POS_MIN]
|
||||
// TODO: here we incorrectly deterimne that the saved checkpoint data covers the [pos_min, pos_max] range
|
||||
// this is not true for SWA models: https://github.com/ggml-org/llama.cpp/pull/24411#issuecomment-4677983225
|
||||
cur.update_pos(slot.prompt.n_tokens() - n_tokens_cur, pos_min, pos_max);
|
||||
|
||||
cur.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
@@ -2860,6 +2956,10 @@ private:
|
||||
// guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS]
|
||||
LOG_INF("slot %12.*s: id %2d | task %d | Checking checkpoint with [%d, %d] against %d...\n", 12,
|
||||
func_name, (slot).id, ((slot).task ? (slot).task->id : -1), cur.pos_min, cur.pos_max, pos_min_thold);
|
||||
// workaround for [TAG_CHECKPOINTS_FIX_POS_MIN]
|
||||
if (cur.pos_max > pos_next) {
|
||||
return false;
|
||||
}
|
||||
return cur.pos_min < pos_min_thold || cur.pos_min == 0;
|
||||
}
|
||||
);
|
||||
@@ -2921,7 +3021,7 @@ private:
|
||||
send_partial_response(slot, {}, false, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
} // end of SLOT_STATE_STARTED
|
||||
|
||||
if (!slot.can_split()) {
|
||||
// cannot fit the prompt in the current batch - will try next iter
|
||||
@@ -2976,10 +3076,18 @@ private:
|
||||
bool has_mtmd = false;
|
||||
|
||||
// check if we should process the image
|
||||
while (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
|
||||
while (true) {
|
||||
auto cur_token_idx = slot.prompt.n_tokens();
|
||||
if (
|
||||
cur_token_idx >= slot.task->n_tokens() ||
|
||||
input_tokens[cur_token_idx] != LLAMA_TOKEN_NULL // encountered a text token
|
||||
) {
|
||||
break;
|
||||
}
|
||||
|
||||
// process the image
|
||||
size_t n_tokens_out = 0;
|
||||
int32_t res = input_tokens.process_chunk(ctx_tgt, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
|
||||
int32_t res = slot.process_mtmd_chunk(cur_token_idx, n_tokens_out);
|
||||
if (res != 0) {
|
||||
SLT_ERR(slot, "failed to process image, res = %d\n", res);
|
||||
send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
|
||||
@@ -2987,22 +3095,11 @@ private:
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ctx_dft && llama_get_ctx_other(ctx_dft.get()) != ctx_tgt) {
|
||||
// TODO: in the future, figure out how to infuse target embeddings to the images
|
||||
// for now, we skip this for simplicity
|
||||
// maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above?
|
||||
// [TAG_MTMD_DRAFT_PROCESSING]
|
||||
res = input_tokens.process_chunk(ctx_dft.get(), mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
|
||||
if (res != 0) {
|
||||
GGML_ABORT("failed to process multi-modal data on draft context\n");
|
||||
}
|
||||
}
|
||||
|
||||
slot.n_prompt_tokens_processed += n_tokens_out;
|
||||
|
||||
// add the image chunk to cache
|
||||
{
|
||||
const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens());
|
||||
const auto & chunk = input_tokens.find_chunk(cur_token_idx);
|
||||
slot.prompt.tokens.push_back(chunk.get()); // copy
|
||||
}
|
||||
|
||||
|
||||
+208
-10
@@ -113,7 +113,7 @@ bool server_http_context::init(const common_params & params) {
|
||||
#endif
|
||||
|
||||
srv->set_default_headers({{"Server", "llama.cpp"}});
|
||||
srv->set_logger(log_server_request);
|
||||
// srv->set_logger(log_server_request); // TODO @ngxson : this is too spamy, no very useful; improve it in the future
|
||||
srv->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) {
|
||||
// this is fail-safe; exceptions should already handled by `ex_wrapper`
|
||||
|
||||
@@ -169,29 +169,108 @@ bool server_http_context::init(const common_params & params) {
|
||||
SRV_INF("api_keys: %zu keys loaded\n", params.api_keys.size());
|
||||
}
|
||||
|
||||
//
|
||||
// Helper: Generate iOS splash screen paths from device dimensions
|
||||
// This centralizes PWA asset paths to avoid duplication across CMake, C++, and TypeScript.
|
||||
// Source of truth: tools/ui/src/lib/constants/pwa.ts (APPLE_DEVICES)
|
||||
//
|
||||
auto generate_splash_endpoints = []() -> std::vector<std::string> {
|
||||
// Apple device dimensions (width x height) with orientation and color scheme
|
||||
// Format: "orientation-dimension1xdimension2.png" or "orientation-dark-dimension1xdimension2.png"
|
||||
// Based on https://developer.apple.com/design/human-interface-guidelines/app-icons
|
||||
static const std::vector<std::pair<std::string, std::string>> splash_specs = {
|
||||
// Portrait screens (light)
|
||||
{"portrait", "640x1136"}, {"portrait", "750x1334"},
|
||||
{"portrait", "1170x2532"}, {"portrait", "1179x2556"},
|
||||
{"portrait", "1206x2622"}, {"portrait", "1284x2778"},
|
||||
{"portrait", "1290x2796"}, {"portrait", "1320x2868"},
|
||||
{"portrait", "1488x2266"}, {"portrait", "1640x2360"},
|
||||
{"portrait", "1668x2388"}, {"portrait", "2048x2732"},
|
||||
// Landscape screens (light) - dimensions swapped
|
||||
{"landscape", "1136x640"}, {"landscape", "1334x750"},
|
||||
{"landscape", "2532x1170"}, {"landscape", "2556x1179"},
|
||||
{"landscape", "2622x1206"}, {"landscape", "2778x1284"},
|
||||
{"landscape", "2796x1290"}, {"landscape", "2868x1320"},
|
||||
{"landscape", "2266x1488"}, {"landscape", "2360x1640"},
|
||||
{"landscape", "2388x1668"}, {"landscape", "2732x2048"},
|
||||
// Portrait screens (dark)
|
||||
{"portrait-dark", "640x1136"}, {"portrait-dark", "750x1334"},
|
||||
{"portrait-dark", "1170x2532"}, {"portrait-dark", "1179x2556"},
|
||||
{"portrait-dark", "1206x2622"}, {"portrait-dark", "1284x2778"},
|
||||
{"portrait-dark", "1290x2796"}, {"portrait-dark", "1320x2868"},
|
||||
{"portrait-dark", "1488x2266"}, {"portrait-dark", "1640x2360"},
|
||||
{"portrait-dark", "1668x2388"}, {"portrait-dark", "2048x2732"},
|
||||
// Landscape screens (dark)
|
||||
{"landscape-dark", "1136x640"}, {"landscape-dark", "1334x750"},
|
||||
{"landscape-dark", "2532x1170"}, {"landscape-dark", "2556x1179"},
|
||||
{"landscape-dark", "2622x1206"}, {"landscape-dark", "2778x1284"},
|
||||
{"landscape-dark", "2796x1290"}, {"landscape-dark", "2868x1320"},
|
||||
{"landscape-dark", "2266x1488"}, {"landscape-dark", "2360x1640"},
|
||||
{"landscape-dark", "2388x1668"}, {"landscape-dark", "2732x2048"}
|
||||
};
|
||||
|
||||
std::vector<std::string> endpoints;
|
||||
endpoints.reserve(splash_specs.size());
|
||||
for (const auto & [orientation, dimensions] : splash_specs) {
|
||||
endpoints.push_back("/apple-splash-" + orientation + "-" + dimensions + ".png");
|
||||
}
|
||||
return endpoints;
|
||||
};
|
||||
|
||||
//
|
||||
// Middlewares
|
||||
//
|
||||
|
||||
auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) {
|
||||
static const std::unordered_set<std::string> public_endpoints = {
|
||||
// Public endpoints list - includes health, UI, and PWA assets
|
||||
// Source of truth for splash screen paths: tools/ui/src/lib/constants/pwa.ts (APPLE_DEVICES)
|
||||
static const std::unordered_set<std::string> get_public_endpoints = [generate_splash_endpoints]() {
|
||||
std::unordered_set<std::string> endpoints {
|
||||
"/health",
|
||||
"/v1/health",
|
||||
"/models",
|
||||
"/v1/models",
|
||||
"/",
|
||||
"/index.html",
|
||||
"/bundle.js",
|
||||
"/bundle.css",
|
||||
// PWA assets
|
||||
"/favicon.ico",
|
||||
"/favicon-dark.ico",
|
||||
"/favicon.svg",
|
||||
"/favicon-dark.svg",
|
||||
"/pwa-64x64.png",
|
||||
"/pwa-192x192.png",
|
||||
"/pwa-512x512.png",
|
||||
"/maskable-icon-512x512.png",
|
||||
"/apple-touch-icon-180x180.png",
|
||||
// iOS splash screens (generated from APPLE_DEVICES in TypeScript)
|
||||
// PWA runtime files
|
||||
"/manifest.webmanifest",
|
||||
"/sw.js",
|
||||
"/version.json",
|
||||
"/workbox-<hash>.js",
|
||||
"/_app/version.json",
|
||||
"/build.json"
|
||||
};
|
||||
// Add all splash screen endpoints
|
||||
auto splash = generate_splash_endpoints();
|
||||
for (const auto & path : splash) {
|
||||
endpoints.insert(path);
|
||||
}
|
||||
return endpoints;
|
||||
}();
|
||||
|
||||
auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) {
|
||||
// If API key is not set, skip validation
|
||||
if (api_keys.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// If path is public or static file, skip validation
|
||||
if (public_endpoints.find(req.path) != public_endpoints.end()) {
|
||||
if (get_public_endpoints.find(req.path) != get_public_endpoints.end()) {
|
||||
return true;
|
||||
}
|
||||
// Static assets (_app/ files, workbox runtime). These are embedded at build time
|
||||
// so no API key is needed — browsers fetch them directly.
|
||||
if (req.path.find("/_app/") == 0 || req.path.find("/workbox-") == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -315,7 +394,11 @@ bool server_http_context::init(const common_params & params) {
|
||||
}
|
||||
} else {
|
||||
#if defined(LLAMA_UI_HAS_ASSETS)
|
||||
auto serve_asset = [](const std::string & name, const char * mime, bool with_isolation_headers) {
|
||||
// Embedded assets are immutable — cache aggressively for PWA/sw offline support.
|
||||
// PWA runtime files (sw.js, manifest, version.json) use no-cache for revalidation.
|
||||
// Bundle files use Vite content hashes (bundle.<hash>.js/css) so each build
|
||||
// produces a different filename — browsers naturally get a fresh copy on upgrade.
|
||||
auto serve_asset_cached = [](const std::string & name, const char * mime, bool with_isolation_headers) {
|
||||
return [name, mime, with_isolation_headers](const httplib::Request & req, httplib::Response & res) {
|
||||
const llama_ui_asset * a = llama_ui_find_asset(name.c_str());
|
||||
if (!a) {
|
||||
@@ -334,14 +417,129 @@ bool server_http_context::init(const common_params & params) {
|
||||
res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
|
||||
res.set_header("Cross-Origin-Opener-Policy", "same-origin");
|
||||
}
|
||||
res.set_header("Cache-Control", "public, max-age=31536000, immutable");
|
||||
res.set_content(reinterpret_cast<const char*>(a->data), a->size, mime);
|
||||
return false;
|
||||
};
|
||||
};
|
||||
|
||||
srv->Get(params.api_prefix + "/", serve_asset("index.html", "text/html; charset=utf-8", true));
|
||||
srv->Get(params.api_prefix + "/bundle.js", serve_asset("bundle.js", "application/javascript; charset=utf-8", false));
|
||||
srv->Get(params.api_prefix + "/bundle.css", serve_asset("bundle.css", "text/css; charset=utf-8", false));
|
||||
auto serve_asset_nocache = [](const std::string & name, const char * mime, bool with_isolation_headers) {
|
||||
return [name, mime, with_isolation_headers](const httplib::Request & /*req*/, httplib::Response & res) {
|
||||
const llama_ui_asset * a = llama_ui_find_asset(name.c_str());
|
||||
if (!a) {
|
||||
res.status = 404;
|
||||
return false;
|
||||
}
|
||||
if (with_isolation_headers) {
|
||||
res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
|
||||
res.set_header("Cross-Origin-Opener-Policy", "same-origin");
|
||||
}
|
||||
res.set_header("Cache-Control", "no-cache");
|
||||
res.set_content(reinterpret_cast<const char*>(a->data), a->size, mime);
|
||||
return false;
|
||||
};
|
||||
};
|
||||
|
||||
// Bundle files in _app/immutable/ — SvelteKit outputs them here and index.html
|
||||
// and sw.js reference them via these paths (vanilla build, no plugin).
|
||||
auto serve_bundle = [serve_asset_cached](const httplib::Request & req, httplib::Response & res) {
|
||||
std::string path = req.path;
|
||||
std::string name;
|
||||
const char * mime;
|
||||
if (path.rfind("/_app/immutable/bundle.", 0) == 0 && path.size() > 22) {
|
||||
name = path.substr(1); // strip leading /
|
||||
mime = "application/javascript; charset=utf-8";
|
||||
} else if (path.rfind("/_app/immutable/assets/bundle.", 0) == 0 && path.size() > 30) {
|
||||
name = path.substr(1); // strip leading /
|
||||
mime = "text/css; charset=utf-8";
|
||||
} else {
|
||||
res.status = 404;
|
||||
return false;
|
||||
}
|
||||
return serve_asset_cached(name, mime, false)(req, res);
|
||||
};
|
||||
|
||||
// _app/ paths — vanilla SvelteKit output, index.html and sw.js reference
|
||||
// bundles and version.json here directly.
|
||||
srv->Get(params.api_prefix + R"(/_app/immutable/bundle\.[^/]+\.js)", serve_bundle);
|
||||
srv->Get(params.api_prefix + R"(/_app/immutable/assets/bundle\.[^/]+\.css)", serve_bundle);
|
||||
srv->Get(params.api_prefix + "/_app/version.json", serve_asset_cached("_app/version.json", "application/json; charset=utf-8", false));
|
||||
|
||||
auto serve_workbox = [serve_asset_cached](const httplib::Request & req, httplib::Response & res) {
|
||||
std::string name = req.path.substr(1);
|
||||
if (name.rfind("workbox-", 0) == 0 && name.size() > 10) {
|
||||
return serve_asset_cached(name, "application/javascript; charset=utf-8", false)(req, res);
|
||||
}
|
||||
res.status = 404;
|
||||
return false;
|
||||
};
|
||||
srv->Get(params.api_prefix + R"(/workbox-[^/]+\.js)", serve_workbox);
|
||||
srv->Get(params.api_prefix + R"(/sw\.js)", serve_asset_cached("sw.js", "application/javascript; charset=utf-8", false));
|
||||
srv->Get(params.api_prefix + "/manifest.webmanifest", serve_asset_cached("manifest.webmanifest", "application/manifest+json; charset=utf-8", false));
|
||||
srv->Get(params.api_prefix + "/version.json", serve_asset_cached("_app/version.json", "application/json; charset=utf-8", false));
|
||||
srv->Get(params.api_prefix + "/build.json", serve_asset_cached("build.json", "application/json; charset=utf-8", false));
|
||||
|
||||
// Finally serve index.html for all other routes (SPA fallback)
|
||||
srv->Get(params.api_prefix + "/", serve_asset_cached("index.html", "text/html; charset=utf-8", true));
|
||||
srv->Get(params.api_prefix + "/favicon.ico", serve_asset_cached("favicon.ico", "image/x-icon", false));
|
||||
srv->Get(params.api_prefix + "/favicon-dark.ico", serve_asset_cached("favicon-dark.ico", "image/x-icon", false));
|
||||
srv->Get(params.api_prefix + "/favicon.svg", serve_asset_cached("favicon.svg", "image/svg+xml", false));
|
||||
srv->Get(params.api_prefix + "/favicon-dark.svg", serve_asset_cached("favicon-dark.svg", "image/svg+xml", false));
|
||||
srv->Get(params.api_prefix + "/pwa-64x64.png", serve_asset_cached("pwa-64x64.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/pwa-192x192.png", serve_asset_cached("pwa-192x192.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/pwa-512x512.png", serve_asset_cached("pwa-512x512.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/maskable-icon-512x512.png", serve_asset_cached("maskable-icon-512x512.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-touch-icon-180x180.png", serve_asset_cached("apple-touch-icon-180x180.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-640x1136.png", serve_asset_cached("apple-splash-portrait-640x1136.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-1136x640.png", serve_asset_cached("apple-splash-landscape-1136x640.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-750x1334.png", serve_asset_cached("apple-splash-portrait-750x1334.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-1334x750.png", serve_asset_cached("apple-splash-landscape-1334x750.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-1170x2532.png", serve_asset_cached("apple-splash-portrait-1170x2532.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-2532x1170.png", serve_asset_cached("apple-splash-landscape-2532x1170.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-1179x2556.png", serve_asset_cached("apple-splash-portrait-1179x2556.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-2556x1179.png", serve_asset_cached("apple-splash-landscape-2556x1179.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-1206x2622.png", serve_asset_cached("apple-splash-portrait-1206x2622.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-2622x1206.png", serve_asset_cached("apple-splash-landscape-2622x1206.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-1284x2778.png", serve_asset_cached("apple-splash-portrait-1284x2778.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-2778x1284.png", serve_asset_cached("apple-splash-landscape-2778x1284.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-1290x2796.png", serve_asset_cached("apple-splash-portrait-1290x2796.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-2796x1290.png", serve_asset_cached("apple-splash-landscape-2796x1290.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-1320x2868.png", serve_asset_cached("apple-splash-portrait-1320x2868.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-2868x1320.png", serve_asset_cached("apple-splash-landscape-2868x1320.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-1488x2266.png", serve_asset_cached("apple-splash-portrait-1488x2266.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-2266x1488.png", serve_asset_cached("apple-splash-landscape-2266x1488.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-1640x2360.png", serve_asset_cached("apple-splash-portrait-1640x2360.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-2360x1640.png", serve_asset_cached("apple-splash-landscape-2360x1640.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-1668x2388.png", serve_asset_cached("apple-splash-portrait-1668x2388.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-2388x1668.png", serve_asset_cached("apple-splash-landscape-2388x1668.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-2048x2732.png", serve_asset_cached("apple-splash-portrait-2048x2732.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-2732x2048.png", serve_asset_cached("apple-splash-landscape-2732x2048.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-dark-640x1136.png", serve_asset_cached("apple-splash-portrait-dark-640x1136.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-dark-1136x640.png", serve_asset_cached("apple-splash-landscape-dark-1136x640.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-dark-750x1334.png", serve_asset_cached("apple-splash-portrait-dark-750x1334.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-dark-1334x750.png", serve_asset_cached("apple-splash-landscape-dark-1334x750.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1170x2532.png", serve_asset_cached("apple-splash-portrait-dark-1170x2532.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2532x1170.png", serve_asset_cached("apple-splash-landscape-dark-2532x1170.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1179x2556.png", serve_asset_cached("apple-splash-portrait-dark-1179x2556.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2556x1179.png", serve_asset_cached("apple-splash-landscape-dark-2556x1179.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1206x2622.png", serve_asset_cached("apple-splash-portrait-dark-1206x2622.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2622x1206.png", serve_asset_cached("apple-splash-landscape-dark-2622x1206.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1284x2778.png", serve_asset_cached("apple-splash-portrait-dark-1284x2778.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2778x1284.png", serve_asset_cached("apple-splash-landscape-dark-2778x1284.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1290x2796.png", serve_asset_cached("apple-splash-portrait-dark-1290x2796.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2796x1290.png", serve_asset_cached("apple-splash-landscape-dark-2796x1290.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1320x2868.png", serve_asset_cached("apple-splash-portrait-dark-1320x2868.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2868x1320.png", serve_asset_cached("apple-splash-landscape-dark-2868x1320.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1640x2360.png", serve_asset_cached("apple-splash-portrait-dark-1640x2360.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2360x1640.png", serve_asset_cached("apple-splash-landscape-dark-2360x1640.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-dark-1668x2388.png", serve_asset_cached("apple-splash-portrait-dark-1668x2388.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2388x1668.png", serve_asset_cached("apple-splash-landscape-dark-2388x1668.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-portrait-dark-2048x2732.png", serve_asset_cached("apple-splash-portrait-dark-2048x2732.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/apple-splash-landscape-dark-2732x2048.png", serve_asset_cached("apple-splash-landscape-dark-2732x2048.png", "image/png", false));
|
||||
srv->Get(params.api_prefix + "/manifest.webmanifest", serve_asset_nocache("manifest.webmanifest", "application/manifest+json", false));
|
||||
srv->Get(params.api_prefix + "/sw.js", serve_asset_nocache("sw.js", "application/javascript; charset=utf-8", false));
|
||||
srv->Get(params.api_prefix + "/version.json", serve_asset_nocache("version.json", "application/json", false));
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
+14
-12
@@ -94,20 +94,22 @@ int llama_server(int argc, char ** argv) {
|
||||
const bool is_router_server = params.model.path.empty();
|
||||
common_params_print_info(params, !is_router_server);
|
||||
|
||||
// validate batch size for embeddings
|
||||
// embeddings require all tokens to be processed in a single ubatch
|
||||
// see https://github.com/ggml-org/llama.cpp/issues/12836
|
||||
if (params.embedding && params.n_batch > params.n_ubatch) {
|
||||
SRV_WRN("embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", params.n_batch, params.n_ubatch);
|
||||
SRV_WRN("setting n_batch = n_ubatch = %d to avoid assertion failure\n", params.n_ubatch);
|
||||
params.n_batch = params.n_ubatch;
|
||||
}
|
||||
if (!is_router_server) {
|
||||
// validate batch size for embeddings
|
||||
// embeddings require all tokens to be processed in a single ubatch
|
||||
// see https://github.com/ggml-org/llama.cpp/issues/12836
|
||||
if (params.embedding && params.n_batch > params.n_ubatch) {
|
||||
SRV_WRN("embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", params.n_batch, params.n_ubatch);
|
||||
SRV_WRN("setting n_batch = n_ubatch = %d to avoid assertion failure\n", params.n_ubatch);
|
||||
params.n_batch = params.n_ubatch;
|
||||
}
|
||||
|
||||
if (params.n_parallel < 0) {
|
||||
SRV_INF("%s", "n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n");
|
||||
if (params.n_parallel < 0) {
|
||||
SRV_INF("%s", "n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n");
|
||||
|
||||
params.n_parallel = 4;
|
||||
params.kv_unified = true;
|
||||
params.n_parallel = 4;
|
||||
params.kv_unified = true;
|
||||
}
|
||||
}
|
||||
|
||||
// for consistency between server router mode and single-model mode, we set the same model name as alias
|
||||
|
||||
@@ -26,7 +26,7 @@ def test_access_static_assets_without_api_key():
|
||||
"""Static web UI assets should not require API key authentication (issue #21229)"""
|
||||
global server
|
||||
server.start()
|
||||
for path in ["/", "/bundle.js", "/bundle.css"]:
|
||||
for path in ["/", "/sw.js", "/manifest.webmanifest", "/_app/version.json"]:
|
||||
res = server.make_request("GET", path)
|
||||
assert res.status_code == 200, f"Expected 200 for {path}, got {res.status_code}"
|
||||
|
||||
|
||||
@@ -8,6 +8,8 @@ node_modules
|
||||
.wrangler
|
||||
/.svelte-kit
|
||||
/build
|
||||
dev-dist
|
||||
dist
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
@@ -23,6 +25,15 @@ Thumbs.db
|
||||
vite.config.js.timestamp-*
|
||||
vite.config.ts.timestamp-*
|
||||
|
||||
# PWA Artifacts
|
||||
apple-splash-*.png
|
||||
apple-touch-icon-*.png
|
||||
favicon.ico
|
||||
favicon-dark.ico
|
||||
maskable-icon-*.png
|
||||
pwa-*.png
|
||||
|
||||
# Storybook
|
||||
*storybook.log
|
||||
storybook-static
|
||||
*.code-workspace
|
||||
|
||||
@@ -77,6 +77,7 @@ add_custom_target(llama-ui-assets ALL
|
||||
"-DUI_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}"
|
||||
"-DUI_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}"
|
||||
"-DLLAMA_SOURCE_DIR=${PROJECT_SOURCE_DIR}"
|
||||
"-DLLAMA_BUILD_NUMBER=${LLAMA_BUILD_NUMBER}"
|
||||
"-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}"
|
||||
"-DHF_VERSION=${HF_UI_VERSION}"
|
||||
"-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}"
|
||||
|
||||
Generated
+6854
-1259
File diff suppressed because it is too large
Load Diff
+31
-25
@@ -4,8 +4,9 @@
|
||||
"version": "1.0.0",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"build": "npm run build-pwa-assets && vite build",
|
||||
"build-pwa-assets": "npx @vite-pwa/assets-generator --root . --config pwa-assets.config.ts && npx @vite-pwa/assets-generator --root . --config pwa-assets-dark.config.ts && node scripts/make-icons-circular.js",
|
||||
"dev": "bash scripts/dev.sh",
|
||||
"build": "vite build",
|
||||
"preview": "vite preview",
|
||||
"prepare": "svelte-kit sync || echo ''",
|
||||
"check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json",
|
||||
@@ -15,12 +16,15 @@
|
||||
"lint": "prettier --check . && eslint .",
|
||||
"test": "npm run test:ui -- --run && npm run test:client -- --run && npm run test:unit -- --run && npm run test:e2e",
|
||||
"test:e2e": "playwright test",
|
||||
"test:e2e:pwa": "playwright test tests/e2e/pwa.e2e.ts",
|
||||
"test:client": "vitest --project=client",
|
||||
"test:unit": "vitest --project=unit",
|
||||
"test:unit:pwa": "vitest --project=unit --run tests/unit/pwa.spec.ts",
|
||||
"test:pwa": "npm run test:unit:pwa && npm run test:e2e:pwa",
|
||||
"test:ui": "vitest --project=ui",
|
||||
"storybook": "storybook dev -p 6006",
|
||||
"build-storybook": "storybook build",
|
||||
"cleanup": "rm -rf .svelte-kit build node_modules test-results"
|
||||
"cleanup": "rm -rf .svelte-kit build node_modules test-results dist dev-dist debug-storybook.log static/pwa-*.png static/maskable-icon-*.png static/apple-touch-icon-*.png static/apple-splash-*.png static/favicon*.ico"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@chromatic-com/storybook": "5.0.0",
|
||||
@@ -41,29 +45,31 @@
|
||||
"@tailwindcss/forms": "0.5.10",
|
||||
"@tailwindcss/typography": "0.5.16",
|
||||
"@tailwindcss/vite": "4.1.11",
|
||||
"@types/node": "^24",
|
||||
"@types/node": "24.13.0",
|
||||
"@vite-pwa/assets-generator": "1.0.2",
|
||||
"@vite-pwa/sveltekit": "1.1.0",
|
||||
"@vitest/browser": "4.1.8",
|
||||
"@vitest/browser-playwright": "4.1.8",
|
||||
"@vitest/coverage-v8": "4.1.8",
|
||||
"bits-ui": "2.18.1",
|
||||
"clsx": "2.1.1",
|
||||
"dexie": "4.0.11",
|
||||
"eslint": "9.39.2",
|
||||
"dexie": "4.4.3",
|
||||
"eslint": "9.39.4",
|
||||
"eslint-config-prettier": "10.1.8",
|
||||
"eslint-plugin-storybook": "10.2.4",
|
||||
"eslint-plugin-svelte": "3.15.0",
|
||||
"globals": "16.3.0",
|
||||
"eslint-plugin-storybook": "10.4.2",
|
||||
"eslint-plugin-svelte": "3.19.0",
|
||||
"globals": "16.5.0",
|
||||
"highlight.js": "11.11.1",
|
||||
"http-server": "14.1.1",
|
||||
"mdast": "3.0.0",
|
||||
"mdsvex": "0.12.6",
|
||||
"mdsvex": "0.12.7",
|
||||
"mermaid": "11.15.0",
|
||||
"mode-watcher": "1.1.0",
|
||||
"pdfjs-dist": "5.4.54",
|
||||
"playwright": "1.56.1",
|
||||
"prettier": "3.6.2",
|
||||
"prettier-plugin-svelte": "3.4.0",
|
||||
"prettier-plugin-tailwindcss": "0.6.14",
|
||||
"prettier": "3.8.3",
|
||||
"prettier-plugin-svelte": "4.1.0",
|
||||
"prettier-plugin-tailwindcss": "0.8.0",
|
||||
"rehype-highlight": "7.0.2",
|
||||
"rehype-katex": "7.0.1",
|
||||
"rehype-stringify": "10.0.1",
|
||||
@@ -73,25 +79,25 @@
|
||||
"remark-html": "16.0.1",
|
||||
"remark-math": "6.0.0",
|
||||
"remark-rehype": "11.1.2",
|
||||
"sass": "1.93.3",
|
||||
"storybook": "10.3.3",
|
||||
"svelte": "5.55.7",
|
||||
"svelte-check": "4.3.0",
|
||||
"svelte-sonner": "1.0.5",
|
||||
"tailwind-merge": "3.3.1",
|
||||
"sass": "1.100.0",
|
||||
"storybook": "10.4.2",
|
||||
"svelte": "5.56.1",
|
||||
"svelte-check": "4.6.0",
|
||||
"svelte-sonner": "1.1.1",
|
||||
"tailwind-merge": "3.6.0",
|
||||
"tailwind-variants": "3.2.2",
|
||||
"tailwindcss": "4.1.11",
|
||||
"tw-animate-css": "1.3.5",
|
||||
"typescript": "5.8.3",
|
||||
"typescript-eslint": "8.56.0",
|
||||
"tailwindcss": "4.3.0",
|
||||
"tw-animate-css": "1.4.0",
|
||||
"typescript": "5.9.3",
|
||||
"typescript-eslint": "8.60.1",
|
||||
"unified": "11.0.5",
|
||||
"unist-util-visit": "5.0.0",
|
||||
"unist-util-visit": "5.1.0",
|
||||
"uuid": "13.0.2",
|
||||
"vite": "7.3.2",
|
||||
"vite": "7.3.5",
|
||||
"vite-plugin-devtools-json": "0.2.1",
|
||||
"vitest": "4.1.8",
|
||||
"vitest-browser-svelte": "2.1.1",
|
||||
"zod": "4.2.1"
|
||||
"workbox-window": "7.4.1"
|
||||
},
|
||||
"overrides": {
|
||||
"cookie": "1.1.1"
|
||||
|
||||
@@ -1,11 +1,31 @@
|
||||
import { defineConfig } from '@playwright/test';
|
||||
import { defineConfig, devices } from '@playwright/test';
|
||||
|
||||
export default defineConfig({
|
||||
testDir: 'tests/e2e',
|
||||
testMatch: ['**/*.e2e.ts'],
|
||||
timeout: 30000,
|
||||
expect: {
|
||||
timeout: 5000
|
||||
},
|
||||
fullyParallel: true,
|
||||
forbidOnly: !!process.env.CI,
|
||||
retries: process.env.CI ? 2 : 0,
|
||||
workers: process.env.CI ? 1 : undefined,
|
||||
reporter: 'line',
|
||||
use: {
|
||||
baseURL: 'http://localhost:8181',
|
||||
trace: 'on-first-retry'
|
||||
},
|
||||
projects: [
|
||||
{
|
||||
name: 'chromium',
|
||||
use: { ...devices['Desktop Chrome'] }
|
||||
}
|
||||
],
|
||||
webServer: {
|
||||
command: 'npm run build && npx http-server ./dist -p 8181',
|
||||
port: 8181,
|
||||
timeout: 120000,
|
||||
reuseExistingServer: false
|
||||
},
|
||||
testDir: 'tests/e2e'
|
||||
reuseExistingServer: !process.env.CI
|
||||
}
|
||||
});
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
import { defineConfig } from '@vite-pwa/assets-generator/config';
|
||||
|
||||
export default defineConfig({
|
||||
headLinkOptions: {
|
||||
preset: '2023'
|
||||
},
|
||||
preset: {
|
||||
transparent: {
|
||||
sizes: [],
|
||||
favicons: [[48, 'favicon-dark.ico']]
|
||||
},
|
||||
maskable: {
|
||||
sizes: []
|
||||
},
|
||||
apple: {
|
||||
sizes: []
|
||||
}
|
||||
},
|
||||
images: ['static/favicon-dark.svg']
|
||||
});
|
||||
@@ -0,0 +1,51 @@
|
||||
import {
|
||||
combinePresetAndAppleSplashScreens,
|
||||
defineConfig,
|
||||
minimal2023Preset
|
||||
} from '@vite-pwa/assets-generator/config';
|
||||
import { readFileSync } from 'node:fs';
|
||||
import { resolve } from 'node:path';
|
||||
import { THEME_COLORS, PWA_GENERATOR_DEVICES, PWA_ASSET_GENERATOR } from './src/lib/constants/pwa';
|
||||
import { SplashOrientation } from './src/lib/enums/splash.enums';
|
||||
|
||||
export default defineConfig({
|
||||
headLinkOptions: {
|
||||
preset: PWA_ASSET_GENERATOR.LINK_PRESET
|
||||
},
|
||||
preset: combinePresetAndAppleSplashScreens(
|
||||
minimal2023Preset,
|
||||
{
|
||||
padding: PWA_ASSET_GENERATOR.SPLASH_PADDING,
|
||||
resizeOptions: {
|
||||
background: THEME_COLORS.BACKGROUND_LIGHT,
|
||||
fit: PWA_ASSET_GENERATOR.FIT_MODE
|
||||
},
|
||||
darkResizeOptions: {
|
||||
background: THEME_COLORS.BACKGROUND_DARK,
|
||||
fit: PWA_ASSET_GENERATOR.FIT_MODE
|
||||
},
|
||||
darkImageResolver: async (imageName: string) => {
|
||||
if (imageName.endsWith('favicon.svg')) {
|
||||
return readFileSync(resolve('static/favicon-dark.svg'));
|
||||
}
|
||||
},
|
||||
linkMediaOptions: {
|
||||
log: true,
|
||||
addMediaScreen: PWA_ASSET_GENERATOR.ADD_MEDIA_SCREEN,
|
||||
basePath: PWA_ASSET_GENERATOR.BASE_PATH,
|
||||
xhtml: PWA_ASSET_GENERATOR.XHTML
|
||||
},
|
||||
png: {
|
||||
compressionLevel: PWA_ASSET_GENERATOR.PNG_COMPRESSION_LEVEL,
|
||||
quality: PWA_ASSET_GENERATOR.PNG_QUALITY
|
||||
},
|
||||
name: (landscape, size, dark) => {
|
||||
const orientation = landscape ? SplashOrientation.LANDSCAPE : SplashOrientation.PORTRAIT;
|
||||
const darkPrefix = dark ? PWA_ASSET_GENERATOR.DARK_PREFIX : '';
|
||||
return `apple-splash-${orientation}-${darkPrefix}${size.width}x${size.height}.png`;
|
||||
}
|
||||
},
|
||||
PWA_GENERATOR_DEVICES
|
||||
),
|
||||
images: ['static/favicon.svg']
|
||||
});
|
||||
@@ -0,0 +1,137 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
|
||||
* Apply circular mask to pwa-*.png icons.
|
||||
* Uses the maskable icon as source (white bg, full logo) to avoid
|
||||
* the small-colormap pwa icons looking bad when cropped to a circle.
|
||||
*
|
||||
* Usage: node scripts/make-icons-circular.js [--padding-pct <0-50>] [--scale-pct <50-100>]
|
||||
*
|
||||
* - padding-pct: percentage of icon size kept as padding around the circle (default: 25)
|
||||
* - scale-pct: scale down the source image before cropping (default: 85)
|
||||
*
|
||||
* maskable-icon and apple-touch-icon are left untouched.
|
||||
*/
|
||||
|
||||
import sharp from 'sharp';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
const STATIC_DIR = path.resolve(__dirname, '..', 'static');
|
||||
|
||||
const paddingPct = process.argv.reduce((acc, arg, i, args) => {
|
||||
if (arg === '--padding-pct' && args[i + 1]) return parseFloat(args[i + 1]);
|
||||
return acc;
|
||||
}, 0);
|
||||
|
||||
// Scale down the source image before cropping to circle
|
||||
const scalePct = process.argv.reduce((acc, arg, i, args) => {
|
||||
if (arg === '--scale-pct' && args[i + 1]) return parseFloat(args[i + 1]);
|
||||
return acc;
|
||||
}, 85); // default 85% - icon fills 85% of the circular area
|
||||
|
||||
// Source for circular icons: the maskable icon (white bg, full logo)
|
||||
const sourceIcon = 'maskable-icon-512x512.png';
|
||||
const targetIcons = ['pwa-64x64.png', 'pwa-192x192.png', 'pwa-512x512.png'];
|
||||
|
||||
// maskable-icon and apple-touch-icon stay square
|
||||
const untouchedIcons = ['maskable-icon-512x512.png', 'apple-touch-icon-180x180.png'];
|
||||
|
||||
async function makeCircle(targetFilename) {
|
||||
const targetPath = path.join(STATIC_DIR, targetFilename);
|
||||
const sourcePath = path.join(STATIC_DIR, sourceIcon);
|
||||
|
||||
if (!fs.existsSync(sourcePath)) {
|
||||
console.log(`⏭️ ${sourceIcon} not found, skipping`);
|
||||
return;
|
||||
}
|
||||
if (!fs.existsSync(targetPath)) {
|
||||
console.log(`⏭️ ${targetFilename} not found, skipping`);
|
||||
return;
|
||||
}
|
||||
|
||||
const metadata = await sharp(targetPath).metadata();
|
||||
const size = Math.max(metadata.width, metadata.height);
|
||||
const radius = Math.floor((size * (1 - paddingPct / 100)) / 2);
|
||||
const center = Math.floor(size / 2);
|
||||
|
||||
// Build circular mask as RGBA buffer: white opaque circle on transparent bg
|
||||
const maskBuf = Buffer.alloc(size * size * 4, 0);
|
||||
for (let y = 0; y < size; y++) {
|
||||
for (let x = 0; x < size; x++) {
|
||||
const dx = x - center;
|
||||
const dy = y - center;
|
||||
const dist = Math.sqrt(dx * dx + dy * dy);
|
||||
if (dist < radius) {
|
||||
const i = (y * size + x) * 4;
|
||||
maskBuf[i] = 255;
|
||||
maskBuf[i + 1] = 255;
|
||||
maskBuf[i + 2] = 255;
|
||||
maskBuf[i + 3] = 255;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const tmpMask = path.join(STATIC_DIR, '.mask-tmp.png');
|
||||
await sharp(maskBuf, {
|
||||
raw: { width: size, height: size, channels: 4 }
|
||||
})
|
||||
.png()
|
||||
.toFile(tmpMask);
|
||||
|
||||
// Step 1: Scale source relative to circle diameter (not full icon), composite centered onto white canvas of full size
|
||||
const circleDiameter = Math.floor(size * (1 - paddingPct / 100));
|
||||
const scaledSize = Math.floor((circleDiameter * scalePct) / 100);
|
||||
const offset = Math.floor((size - scaledSize) / 2);
|
||||
|
||||
const scaledBuf = await sharp(sourcePath)
|
||||
.resize(scaledSize, scaledSize, {
|
||||
fit: 'cover',
|
||||
background: { r: 255, g: 255, b: 255, alpha: 1 }
|
||||
})
|
||||
.ensureAlpha()
|
||||
.png()
|
||||
.toBuffer();
|
||||
|
||||
// Step 2: Composite scaled image onto white background, then apply circular mask
|
||||
const output = await sharp({
|
||||
create: {
|
||||
width: size,
|
||||
height: size,
|
||||
channels: 4,
|
||||
background: { r: 255, g: 255, b: 255, alpha: 1 }
|
||||
}
|
||||
})
|
||||
.composite([
|
||||
{ input: scaledBuf, top: offset, left: offset },
|
||||
{ input: tmpMask, top: 0, left: 0, blend: 'dest-in' }
|
||||
])
|
||||
.png()
|
||||
.toBuffer();
|
||||
|
||||
fs.writeFileSync(targetPath, output);
|
||||
fs.unlinkSync(tmpMask);
|
||||
|
||||
console.log(
|
||||
`✓ ${targetFilename} → circle from ${sourceIcon}, ${paddingPct}% padding (size=${size}, r=${radius}, scale=${scalePct}%, circleDiameter=${circleDiameter})`
|
||||
);
|
||||
}
|
||||
|
||||
async function main() {
|
||||
console.log(`Circular mask: ${paddingPct}% padding, ${scalePct}% scale, source=${sourceIcon}\n`);
|
||||
for (const icon of targetIcons) {
|
||||
await makeCircle(icon);
|
||||
}
|
||||
|
||||
console.log('\nUnchanged:');
|
||||
for (const icon of untouchedIcons) {
|
||||
const fp = path.join(STATIC_DIR, icon);
|
||||
console.log(` ${icon} (${fs.existsSync(fp) ? fs.statSync(fp).size + ' bytes' : 'missing'})`);
|
||||
}
|
||||
}
|
||||
|
||||
main();
|
||||
@@ -0,0 +1,42 @@
|
||||
import { writeFileSync, existsSync } from 'node:fs';
|
||||
import { resolve } from 'path';
|
||||
import type { Plugin } from 'vite';
|
||||
import { BUILD_CONFIG } from '../src/lib/constants/pwa';
|
||||
|
||||
let processed = false;
|
||||
|
||||
const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? BUILD_CONFIG.OUTPUT_DIR;
|
||||
|
||||
/**
|
||||
* Write build.json with the llama.cpp release build number.
|
||||
*
|
||||
* LLAMA_BUILD_NUMBER is passed from CMake -> npm -> vite via env var.
|
||||
* Used for display of the current llama-server release (e.g. "b1234").
|
||||
*/
|
||||
export function buildInfoPlugin(): Plugin {
|
||||
return {
|
||||
name: 'llamacpp:build-info',
|
||||
apply: 'build',
|
||||
closeBundle() {
|
||||
setTimeout(() => {
|
||||
try {
|
||||
if (processed) return;
|
||||
processed = true;
|
||||
|
||||
const buildNumber = process.env.LLAMA_BUILD_NUMBER;
|
||||
if (!buildNumber) return;
|
||||
|
||||
const outDir = resolve(OUTPUT_DIR);
|
||||
const indexPath = resolve(outDir, 'index.html');
|
||||
if (!existsSync(indexPath)) return;
|
||||
|
||||
const buildJsonPath = resolve(outDir, 'build.json');
|
||||
writeFileSync(buildJsonPath, JSON.stringify({ version: buildNumber }), 'utf-8');
|
||||
console.log(`Created build.json (version: ${buildNumber})`);
|
||||
} catch (error) {
|
||||
console.error('Failed to write build.json:', error);
|
||||
}
|
||||
}, 100);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -1,105 +0,0 @@
|
||||
import {
|
||||
readFileSync,
|
||||
writeFileSync,
|
||||
existsSync,
|
||||
readdirSync,
|
||||
copyFileSync,
|
||||
rmSync,
|
||||
unlinkSync
|
||||
} from 'fs';
|
||||
import { resolve } from 'path';
|
||||
import type { Plugin } from 'vite';
|
||||
|
||||
const GUIDE_FOR_FRONTEND = `
|
||||
<!--
|
||||
This is a static build of the frontend.
|
||||
It is automatically generated by the build process.
|
||||
Do not edit this file directly.
|
||||
To make changes, refer to the "Web UI" section in the README.
|
||||
-->
|
||||
`.trim();
|
||||
|
||||
const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? './dist';
|
||||
|
||||
export function llamaCppBuildPlugin(): Plugin {
|
||||
return {
|
||||
name: 'llamacpp:build',
|
||||
apply: 'build',
|
||||
closeBundle() {
|
||||
setTimeout(() => {
|
||||
try {
|
||||
const outDir = resolve(OUTPUT_DIR);
|
||||
const indexPath = resolve(outDir, 'index.html');
|
||||
if (!existsSync(indexPath)) return;
|
||||
|
||||
let content = readFileSync(indexPath, 'utf-8');
|
||||
|
||||
// Inline favicon as base64 data URL
|
||||
const faviconPath = resolve('static/favicon.svg');
|
||||
if (existsSync(faviconPath)) {
|
||||
const faviconContent = readFileSync(faviconPath, 'utf-8');
|
||||
const faviconBase64 = Buffer.from(faviconContent).toString('base64');
|
||||
const faviconDataUrl = `data:image/svg+xml;base64,${faviconBase64}`;
|
||||
content = content.replace(/href="[^"]*favicon\.svg"/g, `href="${faviconDataUrl}"`);
|
||||
console.log('✓ Inlined favicon.svg as base64 data URL');
|
||||
}
|
||||
|
||||
content = content.replace(/\r/g, '');
|
||||
content = GUIDE_FOR_FRONTEND + '\n' + content;
|
||||
|
||||
// Keep the Vite hash as a query string so each build busts the browser cache
|
||||
content = content.replace(/\/_app\/immutable\/bundle\.([^".]+)\.js/g, './bundle.js?$1');
|
||||
content = content.replace(
|
||||
/\/_app\/immutable\/assets\/bundle\.([^".]+)\.css/g,
|
||||
'./bundle.css?$1'
|
||||
);
|
||||
content = content.replace(/__sveltekit_[a-z0-9]+/g, '__sveltekit__');
|
||||
|
||||
writeFileSync(indexPath, content, 'utf-8');
|
||||
console.log('✓ Updated index.html');
|
||||
|
||||
// Copy bundle.*.js -> bundle.js at output root
|
||||
const immutableDir = resolve(outDir, '_app/immutable');
|
||||
const bundleDir = resolve(outDir, '_app/immutable/assets');
|
||||
|
||||
if (existsSync(immutableDir)) {
|
||||
const jsFiles = readdirSync(immutableDir).filter((f) => f.match(/^bundle\..+\.js$/));
|
||||
if (jsFiles.length > 0) {
|
||||
copyFileSync(resolve(immutableDir, jsFiles[0]), resolve(outDir, 'bundle.js'));
|
||||
// Normalize __sveltekit_<hash> to __sveltekit__ in bundle.js
|
||||
const bundleJsPath = resolve(outDir, 'bundle.js');
|
||||
let bundleJs = readFileSync(bundleJsPath, 'utf-8');
|
||||
bundleJs = bundleJs.replace(/__sveltekit_[a-z0-9]+/g, '__sveltekit__');
|
||||
writeFileSync(bundleJsPath, bundleJs, 'utf-8');
|
||||
console.log(`✓ Copied ${jsFiles[0]} -> bundle.js`);
|
||||
}
|
||||
}
|
||||
|
||||
// Copy bundle.*.css -> bundle.css at output root
|
||||
if (existsSync(bundleDir)) {
|
||||
const cssFiles = readdirSync(bundleDir).filter((f) => f.match(/^bundle\..+\.css$/));
|
||||
if (cssFiles.length > 0) {
|
||||
copyFileSync(resolve(bundleDir, cssFiles[0]), resolve(outDir, 'bundle.css'));
|
||||
console.log(`✓ Copied ${cssFiles[0]} -> bundle.css`);
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup: remove _app directory, favicon.svg, and legacy index.html.gz
|
||||
const appDir = resolve(outDir, '_app');
|
||||
if (existsSync(appDir)) {
|
||||
rmSync(appDir, { recursive: true, force: true });
|
||||
console.log('✓ Removed _app directory');
|
||||
}
|
||||
|
||||
const faviconOut = resolve(outDir, 'favicon.svg');
|
||||
if (existsSync(faviconOut)) {
|
||||
unlinkSync(faviconOut);
|
||||
console.log('✓ Removed favicon.svg');
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Failed to process build output:', error);
|
||||
}
|
||||
}, 100);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,61 @@
|
||||
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
|
||||
import { resolve } from 'path';
|
||||
import type { Plugin } from 'vite';
|
||||
import { BUILD_CONFIG } from '../src/lib/constants/pwa';
|
||||
|
||||
let processed = false;
|
||||
|
||||
const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? BUILD_CONFIG.OUTPUT_DIR;
|
||||
|
||||
function rewrite(path: string, pairs: [string, string][]): void {
|
||||
if (!existsSync(path)) {
|
||||
return;
|
||||
}
|
||||
const text = readFileSync(path, 'utf-8');
|
||||
let out = text;
|
||||
for (const [from, to] of pairs) {
|
||||
out = out.split(from).join(to);
|
||||
}
|
||||
if (out !== text) {
|
||||
writeFileSync(path, out, 'utf-8');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Relativize SvelteKit absolute base refs so the build is relocatable under any subpath.
|
||||
*
|
||||
* SvelteKit bakes root absolute /_app/ paths into the SPA fallback because paths.relative
|
||||
* does not apply to a depth agnostic fallback page. Rewriting to ./_app/ lets a plain
|
||||
* recursive copy of the output into /any/subdir/ resolve assets against the document URL.
|
||||
* Runs after adapter-static writes index.html and the PWA plugin writes sw.js, deferred the
|
||||
* same way as buildInfoPlugin so the emitted files exist.
|
||||
*/
|
||||
export function relativizeBasePlugin(): Plugin {
|
||||
return {
|
||||
name: 'llamacpp:relativize-base',
|
||||
apply: 'build',
|
||||
closeBundle() {
|
||||
setTimeout(() => {
|
||||
try {
|
||||
if (processed) return;
|
||||
processed = true;
|
||||
|
||||
const outDir = resolve(OUTPUT_DIR);
|
||||
|
||||
// index.html: modulepreload, stylesheet and bootstrap import reference "/_app/
|
||||
rewrite(resolve(outDir, 'index.html'), [['"/_app/', '"./_app/']]);
|
||||
|
||||
// sw.js: the only absolute entries are the navigate fallback precache key and handler
|
||||
rewrite(resolve(outDir, 'sw.js'), [
|
||||
['{url:"/"', '{url:"./"'],
|
||||
['createHandlerBoundToURL("/"', 'createHandlerBoundToURL("./"']
|
||||
]);
|
||||
|
||||
console.log('Relativized base refs in index.html and sw.js');
|
||||
} catch (error) {
|
||||
console.error('Failed to relativize base refs:', error);
|
||||
}
|
||||
}, 100);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,115 @@
|
||||
import { readdirSync, readFileSync, writeFileSync, existsSync } from 'node:fs';
|
||||
import { resolve } from 'path';
|
||||
import type { Plugin } from 'vite';
|
||||
import { TAB, NEWLINE } from '../src/lib/constants/code';
|
||||
import { APPLE_DEVICES, BUILD_CONFIG, REGEX_PATTERNS, SPLASH_LINK } from '../src/lib/constants/pwa';
|
||||
import type { SplashDimensions } from '../src/lib/types';
|
||||
import { SplashOrientation } from '../src/lib/enums/splash.enums';
|
||||
|
||||
let processed = false;
|
||||
|
||||
const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? BUILD_CONFIG.OUTPUT_DIR;
|
||||
|
||||
/**
|
||||
* Generate iOS splash screen <link> tags from generated apple-splash-*.png files.
|
||||
* Returns an array of HTML link strings to be injected into the page head.
|
||||
*/
|
||||
export function generateSplashScreenLinks(outDir: string): string[] {
|
||||
const files = readdirSync(outDir).filter((f) => f.match(REGEX_PATTERNS.SPLASH_FILE));
|
||||
if (files.length === 0) return [];
|
||||
|
||||
const dimMap = new Map<string, SplashDimensions>();
|
||||
for (const [dims, spec] of Object.entries(APPLE_DEVICES)) {
|
||||
const [w, h] = dims.split('x').map(Number);
|
||||
// logical-point dimensions
|
||||
dimMap.set(`${w}x${h}`, { deviceW: spec.width, deviceH: spec.height, dpr: spec.dpr });
|
||||
dimMap.set(`${h}x${w}`, { deviceW: spec.width, deviceH: spec.height, dpr: spec.dpr });
|
||||
// pixel dimensions (used by actual generated splash files)
|
||||
dimMap.set(`${w * spec.dpr}x${h * spec.dpr}`, {
|
||||
deviceW: spec.width,
|
||||
deviceH: spec.height,
|
||||
dpr: spec.dpr
|
||||
});
|
||||
dimMap.set(`${h * spec.dpr}x${w * spec.dpr}`, {
|
||||
deviceW: spec.width,
|
||||
deviceH: spec.height,
|
||||
dpr: spec.dpr
|
||||
});
|
||||
}
|
||||
|
||||
const lightLinks: string[] = [];
|
||||
const darkLinks: string[] = [];
|
||||
|
||||
for (const file of files) {
|
||||
const match = file.match(REGEX_PATTERNS.SPLASH_FILE);
|
||||
if (!match) continue;
|
||||
const orientation = match[1] as SplashOrientation;
|
||||
const isDark = !!match[2];
|
||||
const pixelW = parseInt(match[3]);
|
||||
const pixelH = parseInt(match[4]);
|
||||
|
||||
const key = `${pixelW}x${pixelH}`;
|
||||
const spec = dimMap.get(key);
|
||||
if (!spec) {
|
||||
console.warn(`Unknown splash screen dimensions: ${key} (${file})`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const { deviceW, deviceH, dpr } = spec;
|
||||
const media = `screen and (device-width: ${deviceW}px) and (device-height: ${deviceH}px) and (-webkit-device-pixel-ratio: ${dpr}) and (orientation: ${orientation})`;
|
||||
const href = `./${file}`;
|
||||
|
||||
if (isDark) {
|
||||
darkLinks.push(
|
||||
`${SPLASH_LINK.HTML} media="${media}${SPLASH_LINK.DARK_MEDIA_SUFFIX}" href="${href}">`
|
||||
);
|
||||
} else {
|
||||
lightLinks.push(`${SPLASH_LINK.HTML} media="${media}" href="${href}">`);
|
||||
}
|
||||
}
|
||||
|
||||
return [...lightLinks, ...darkLinks];
|
||||
}
|
||||
|
||||
export function splashScreenPlugin(): Plugin {
|
||||
return {
|
||||
name: 'llamacpp:splash-screen',
|
||||
apply: 'build',
|
||||
closeBundle() {
|
||||
setTimeout(() => {
|
||||
try {
|
||||
if (processed) return;
|
||||
processed = true;
|
||||
|
||||
const outDir = resolve(OUTPUT_DIR);
|
||||
const indexPath = resolve(outDir, 'index.html');
|
||||
if (!existsSync(indexPath)) return;
|
||||
|
||||
let content = readFileSync(indexPath, 'utf-8');
|
||||
|
||||
// Inject iOS splash screen <link> tags into <head>.
|
||||
// The @vite-pwa/assets-generator generates apple-splash-*.png files;
|
||||
// this scans them and creates the <link> tags SvelteKit needs.
|
||||
const splashLinks = generateSplashScreenLinks(outDir);
|
||||
if (splashLinks.length > 0) {
|
||||
console.log(`Generated ${splashLinks.length} apple-splash link tags`);
|
||||
const splashHtml = splashLinks.map((l) => TAB + TAB + l).join(NEWLINE);
|
||||
content = content.replace(
|
||||
REGEX_PATTERNS.HEAD_CLOSE,
|
||||
splashHtml + NEWLINE + TAB + TAB + '</head>'
|
||||
);
|
||||
}
|
||||
|
||||
// Remove trailing \r from Windows line endings
|
||||
content = content.replace(/\r/g, '');
|
||||
content = BUILD_CONFIG.GUIDE_COMMENT + NEWLINE + content;
|
||||
|
||||
writeFileSync(indexPath, content, 'utf-8');
|
||||
console.log('Updated index.html');
|
||||
} catch (error) {
|
||||
console.error('Failed to process build output:', error);
|
||||
}
|
||||
}, 100);
|
||||
}
|
||||
};
|
||||
}
|
||||
Vendored
+3
@@ -1,6 +1,9 @@
|
||||
// See https://svelte.dev/docs/kit/types#app.d.ts
|
||||
// for information about these interfaces
|
||||
|
||||
import 'vite-plugin-pwa/pwa-assets';
|
||||
import 'vite-plugin-pwa/svelte';
|
||||
|
||||
// Import chat types from dedicated module
|
||||
|
||||
import type {
|
||||
|
||||
@@ -2,10 +2,17 @@
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<link rel="icon" href="%sveltekit.assets%/favicon.svg" />
|
||||
<link rel="icon" href="favicon.ico" sizes="48x48" />
|
||||
<link rel="icon" href="favicon.svg" sizes="any" type="image/svg+xml" />
|
||||
|
||||
<link rel="apple-touch-icon" href="apple-touch-icon-180x180.png" />
|
||||
|
||||
<link rel="manifest" href="./manifest.webmanifest" />
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
%sveltekit.head%
|
||||
</head>
|
||||
|
||||
<body data-sveltekit-preload-data="hover">
|
||||
<div style="display: contents">%sveltekit.body%</div>
|
||||
</body>
|
||||
|
||||
@@ -20,6 +20,8 @@
|
||||
import { ColorMode } from '$lib/enums/ui.enums';
|
||||
import { fade } from 'svelte/transition';
|
||||
import { goto } from '$app/navigation';
|
||||
import { Button } from '$lib/components/ui/button';
|
||||
import { RefreshCw } from '@lucide/svelte';
|
||||
import { page } from '$app/state';
|
||||
import { setChatSettingsConfigContext } from '$lib/contexts';
|
||||
import { settingsReferrer } from '$lib/stores/settings-referrer.svelte';
|
||||
@@ -164,6 +166,15 @@
|
||||
onConfigChange={handleConfigChange}
|
||||
onThemeChange={handleThemeChange}
|
||||
/>
|
||||
|
||||
{#if currentSection.title === SETTINGS_SECTION_TITLES.GENERAL}
|
||||
<div class="flex justify-end">
|
||||
<Button variant="outline" onclick={() => window.location.reload()}>
|
||||
<RefreshCw class="h-3 w-3" />
|
||||
Reload app
|
||||
</Button>
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
<script lang="ts">
|
||||
import { APPLE_META_TAGS, MEDIA_QUERIES, THEME_COLORS } from '$lib/constants/pwa';
|
||||
import { APP_NAME } from '$lib/constants';
|
||||
|
||||
let { appName = APP_NAME } = $props();
|
||||
</script>
|
||||
|
||||
<svelte:head>
|
||||
<!-- Theme color for light/dark modes -->
|
||||
<meta name="theme-color" content={THEME_COLORS.LIGHT} media={MEDIA_QUERIES.PREFERS_LIGHT} />
|
||||
<meta name="theme-color" content={THEME_COLORS.DARK} media={MEDIA_QUERIES.PREFERS_DARK} />
|
||||
|
||||
<!-- Apple mobile web app meta tags -->
|
||||
<meta
|
||||
name={APPLE_META_TAGS.MOBILE_WEB_APP_CAPABLE.name}
|
||||
content={APPLE_META_TAGS.MOBILE_WEB_APP_CAPABLE.content}
|
||||
/>
|
||||
<meta
|
||||
name={APPLE_META_TAGS.STATUS_BAR_STYLE.name}
|
||||
content={APPLE_META_TAGS.STATUS_BAR_STYLE.content}
|
||||
/>
|
||||
<meta name={APPLE_META_TAGS.MOBILE_WEB_APP_TITLE.name} content={appName} />
|
||||
</svelte:head>
|
||||
@@ -0,0 +1,35 @@
|
||||
<script lang="ts">
|
||||
import * as Card from '$lib/components/ui/card';
|
||||
import { Button } from '$lib/components/ui/button';
|
||||
|
||||
let { needRefresh: needRefreshProp, updateServiceWorker, forceReload } = $props();
|
||||
let needRefresh = $derived(needRefreshProp ?? false);
|
||||
</script>
|
||||
|
||||
{#if needRefresh}
|
||||
<Card.Root class="overflow-hidden gap-1 py-5">
|
||||
<Card.Header class="px-5">
|
||||
<Card.Title class="text-sm font-medium">Update available</Card.Title>
|
||||
</Card.Header>
|
||||
|
||||
<Card.Content class="gap-6 grid px-5">
|
||||
<p class="text-xs text-muted-foreground">A new version is available. Reload to update.</p>
|
||||
|
||||
<Button
|
||||
class="justify-self-end-safe"
|
||||
size="sm"
|
||||
onclick={() => {
|
||||
updateServiceWorker();
|
||||
|
||||
if (forceReload) {
|
||||
window.location.reload();
|
||||
}
|
||||
|
||||
needRefresh = false;
|
||||
}}
|
||||
>
|
||||
Reload
|
||||
</Button>
|
||||
</Card.Content>
|
||||
</Card.Root>
|
||||
{/if}
|
||||
@@ -0,0 +1,2 @@
|
||||
export { default as PwaMetaTags } from './PwaMetaTags.svelte';
|
||||
export { default as PwaRefreshAlert } from './PwaRefreshAlert.svelte';
|
||||
@@ -0,0 +1 @@
|
||||
export const APP_NAME = import.meta.env?.VITE_PUBLIC_APP_NAME || 'llama-ui';
|
||||
@@ -1,4 +1,5 @@
|
||||
export const NEWLINE = '\n';
|
||||
export const TAB = '\t';
|
||||
export const DEFAULT_LANGUAGE = 'text';
|
||||
export const LANG_PATTERN = /^(\w*)\n?/;
|
||||
export const AMPERSAND_REGEX = /&/g;
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
export * from './agentic';
|
||||
export * from './api-endpoints';
|
||||
export * from './app';
|
||||
export * from './attachment-labels';
|
||||
export * from './database';
|
||||
export * from './reasoning-effort';
|
||||
@@ -36,6 +37,7 @@ export * from './message-export';
|
||||
export * from './model-id';
|
||||
export * from './precision';
|
||||
export * from './processing-info';
|
||||
export * from './pwa';
|
||||
export * from './routes';
|
||||
export * from './sandbox';
|
||||
export * from './settings-keys';
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
/**
|
||||
* JPEG and EXIF binary format constants for orientation parsing.
|
||||
*/
|
||||
|
||||
/** Bytes of file prefix to scan, the APP1 EXIF segment sits near the start */
|
||||
export const EXIF_SCAN_BYTE_LIMIT = 128 * 1024;
|
||||
|
||||
/** JPEG start of image marker */
|
||||
export const JPEG_SOI_MARKER = 0xffd8;
|
||||
|
||||
/** APP1 segment marker byte, carries the EXIF payload */
|
||||
export const APP1_MARKER = 0xe1;
|
||||
|
||||
/** Start of scan marker byte, compressed data begins and no EXIF follows */
|
||||
export const SOS_MARKER = 0xda;
|
||||
|
||||
/** "Exif" signature opening the APP1 payload, big endian uint32 */
|
||||
export const EXIF_SIGNATURE = 0x45786966;
|
||||
|
||||
/** TIFF byte order mark for little endian ("II") */
|
||||
export const TIFF_LITTLE_ENDIAN = 0x4949;
|
||||
|
||||
/** TIFF magic number following the byte order mark */
|
||||
export const TIFF_MAGIC = 42;
|
||||
|
||||
/** EXIF tag id holding the orientation value */
|
||||
export const EXIF_ORIENTATION_TAG = 0x0112;
|
||||
|
||||
/** Size in bytes of one IFD directory entry */
|
||||
export const IFD_ENTRY_SIZE = 12;
|
||||
@@ -0,0 +1,352 @@
|
||||
/**
|
||||
* Centralized PWA constants to avoid magic strings, regexes, and duplicated
|
||||
* definitions across the codebase.
|
||||
*/
|
||||
|
||||
import { APP_NAME } from './app';
|
||||
|
||||
export const MEDIA_QUERIES = {
|
||||
PREFERS_DARK: '(prefers-color-scheme: dark)',
|
||||
PREFERS_LIGHT: '(prefers-color-scheme: light)'
|
||||
} as const;
|
||||
|
||||
export const THEME_COLORS = {
|
||||
LIGHT: '#ffffff',
|
||||
DARK: '#0d0d0d',
|
||||
ACCENT_BLUE: '#2563eb',
|
||||
ACCENT_BLUE_HOVER: '#1d4ed8',
|
||||
BACKGROUND_LIGHT: 'white',
|
||||
BACKGROUND_DARK: '#111111',
|
||||
TITLE_UPDATE_ALERT: {
|
||||
BORDER_LIGHT: 'zinc-200',
|
||||
BORDER_DARK: 'zinc-700',
|
||||
BG_LIGHT: 'white',
|
||||
BG_DARK: 'zinc-800',
|
||||
TEXT_LIGHT: 'zinc-500',
|
||||
TEXT_DARK: 'zinc-400'
|
||||
}
|
||||
} as const;
|
||||
|
||||
export const FAVICON_PATHS = {
|
||||
ICO_LIGHT: 'favicon.ico',
|
||||
ICO_DARK: 'favicon-dark.ico',
|
||||
SVG_LIGHT: 'favicon.svg',
|
||||
SVG_DARK: 'favicon-dark.svg'
|
||||
} as const;
|
||||
|
||||
export const FAVICON_SELECTORS = {
|
||||
ICO_48X48: 'link[rel="icon"][sizes="48x48"]',
|
||||
SVG_ANY: 'link[rel="icon"][type="image/svg+xml"]'
|
||||
} as const;
|
||||
|
||||
export const APPLE_ASSETS = {
|
||||
TOUCH_ICON: 'apple-touch-icon-180x180.png'
|
||||
} as const;
|
||||
|
||||
export const PWA_MANIFEST = {
|
||||
name: APP_NAME,
|
||||
short_name: APP_NAME,
|
||||
description: 'Local AI chat interface powered by llama.cpp',
|
||||
start_url: './',
|
||||
display: 'standalone' as const,
|
||||
background_color: THEME_COLORS.BACKGROUND_LIGHT,
|
||||
theme_color: THEME_COLORS.BACKGROUND_LIGHT,
|
||||
icons: [
|
||||
{ src: 'pwa-64x64.png', sizes: '64x64', type: 'image/png' },
|
||||
{ src: 'pwa-192x192.png', sizes: '192x192', type: 'image/png' },
|
||||
{ src: 'pwa-512x512.png', sizes: '512x512', type: 'image/png', purpose: 'any' as const },
|
||||
{
|
||||
src: 'maskable-icon-512x512.png',
|
||||
sizes: '512x512',
|
||||
type: 'image/png',
|
||||
purpose: 'maskable' as const
|
||||
}
|
||||
]
|
||||
};
|
||||
|
||||
export const PWA_ICON_PATHS = {
|
||||
PWA_64: '/pwa-64x64.png',
|
||||
PWA_192: '/pwa-192x192.png',
|
||||
PWA_512: '/pwa-512x512.png',
|
||||
MASKABLE_512: '/maskable-icon-512x512.png'
|
||||
} as const;
|
||||
|
||||
/** Apple device dimensions (logical points) and DPR, from Apple HIG. */
|
||||
export const APPLE_DEVICES = {
|
||||
// iPhones (DPR 3)
|
||||
'1170x2532': { width: 390, height: 844, dpr: 3 }, // iPhone 13, 15
|
||||
'1179x2556': { width: 393, height: 852, dpr: 3 }, // iPhone 14, 15 Pro, 16
|
||||
'1206x2622': { width: 402, height: 874, dpr: 3 }, // iPhone 16 Plus, 16e
|
||||
'1284x2778': { width: 428, height: 926, dpr: 3 }, // iPhone 15 Plus
|
||||
'1290x2796': { width: 430, height: 932, dpr: 3 }, // iPhone 15 Pro Max, 16 Pro
|
||||
'1320x2868': { width: 440, height: 956, dpr: 3 }, // iPhone 16 Pro Max
|
||||
'750x1334': { width: 375, height: 667, dpr: 2 }, // iPhone 6/7/8, 14
|
||||
'640x1136': { width: 320, height: 568, dpr: 2 }, // iPhone 6/7/8 Plus
|
||||
// iPads (DPR 2)
|
||||
'1668x2388': { width: 834, height: 1194, dpr: 2 }, // iPad Air 11", iPad 11"
|
||||
'2048x2732': { width: 1024, height: 1366, dpr: 2 }, // iPad Pro 12.9"
|
||||
'1640x2360': { width: 820, height: 1180, dpr: 2 }, // iPad Air 10.9"
|
||||
'1032x1376': { width: 1032, height: 1376, dpr: 2 }, // iPad Air 13"
|
||||
'744x1133': { width: 376, height: 573, dpr: 2 } // iPad mini 8.3"
|
||||
} as const;
|
||||
|
||||
export type AppleDeviceKey = keyof typeof APPLE_DEVICES;
|
||||
|
||||
export const PWA_FILE_PATHS = {
|
||||
MANIFEST: '/manifest.webmanifest',
|
||||
SERVICE_WORKER: '/sw.js',
|
||||
VERSION: '/version.json',
|
||||
WORKBOX: '/workbox-<hash>.js'
|
||||
} as const;
|
||||
|
||||
// Used by the server middleware to skip API key validation.
|
||||
// Keep in sync with tools/server/server-http.cpp public_endpoints list.
|
||||
|
||||
export const PUBLIC_ENDPOINTS = [
|
||||
'/health',
|
||||
'/v1/health',
|
||||
'/models',
|
||||
'/v1/models',
|
||||
'/props',
|
||||
'/metrics',
|
||||
'/',
|
||||
'/index.html',
|
||||
|
||||
'/favicon.ico',
|
||||
'/favicon-dark.ico',
|
||||
'/favicon.svg',
|
||||
'/favicon-dark.svg',
|
||||
'/pwa-64x64.png',
|
||||
'/pwa-192x192.png',
|
||||
'/pwa-512x512.png',
|
||||
'/maskable-icon-512x512.png',
|
||||
'/apple-touch-icon-180x180.png',
|
||||
'/apple-splash-portrait-640x1136.png',
|
||||
'/apple-splash-landscape-640x1136.png',
|
||||
'/apple-splash-portrait-750x1334.png',
|
||||
'/apple-splash-landscape-750x1334.png',
|
||||
'/apple-splash-portrait-1170x2532.png',
|
||||
'/apple-splash-landscape-1170x2532.png',
|
||||
'/apple-splash-portrait-1179x2556.png',
|
||||
'/apple-splash-landscape-1179x2556.png',
|
||||
'/apple-splash-portrait-1206x2622.png',
|
||||
'/apple-splash-landscape-1206x2622.png',
|
||||
'/apple-splash-portrait-1284x2778.png',
|
||||
'/apple-splash-landscape-1284x2778.png',
|
||||
'/apple-splash-portrait-1290x2796.png',
|
||||
'/apple-splash-landscape-1290x2796.png',
|
||||
'/apple-splash-portrait-1320x2868.png',
|
||||
'/apple-splash-landscape-1320x2868.png',
|
||||
'/apple-splash-portrait-1488x2266.png',
|
||||
'/apple-splash-landscape-1488x2266.png',
|
||||
'/apple-splash-portrait-1640x2360.png',
|
||||
'/apple-splash-landscape-1640x2360.png',
|
||||
'/apple-splash-portrait-1668x2388.png',
|
||||
'/apple-splash-landscape-1668x2388.png',
|
||||
'/apple-splash-portrait-2048x2732.png',
|
||||
'/apple-splash-landscape-2048x2732.png',
|
||||
'/apple-splash-portrait-dark-640x1136.png',
|
||||
'/apple-splash-landscape-dark-640x1136.png',
|
||||
'/apple-splash-portrait-dark-750x1334.png',
|
||||
'/apple-splash-landscape-dark-750x1334.png',
|
||||
'/apple-splash-portrait-dark-1170x2532.png',
|
||||
'/apple-splash-landscape-dark-1170x2532.png',
|
||||
'/apple-splash-portrait-dark-1179x2556.png',
|
||||
'/apple-splash-landscape-dark-1179x2556.png',
|
||||
'/apple-splash-portrait-dark-1206x2622.png',
|
||||
'/apple-splash-landscape-dark-1206x2622.png',
|
||||
'/apple-splash-portrait-dark-1284x2778.png',
|
||||
'/apple-splash-landscape-dark-1284x2778.png',
|
||||
'/apple-splash-portrait-dark-1290x2796.png',
|
||||
'/apple-splash-landscape-dark-1290x2796.png',
|
||||
'/apple-splash-portrait-dark-1320x2868.png',
|
||||
'/apple-splash-landscape-dark-1320x2868.png',
|
||||
'/apple-splash-portrait-dark-1488x2266.png',
|
||||
'/apple-splash-landscape-dark-1488x2266.png',
|
||||
'/apple-splash-portrait-dark-1640x2360.png',
|
||||
'/apple-splash-landscape-dark-1640x2360.png',
|
||||
'/apple-splash-portrait-dark-1668x2388.png',
|
||||
'/apple-splash-landscape-dark-1668x2388.png',
|
||||
'/apple-splash-portrait-dark-2048x2732.png',
|
||||
'/apple-splash-landscape-dark-2048x2732.png',
|
||||
'/manifest.webmanifest',
|
||||
'/sw.js',
|
||||
'/version.json',
|
||||
'/workbox-<hash>.js'
|
||||
] as const;
|
||||
export const BUILD_CONFIG = {
|
||||
OUTPUT_DIR: './dist',
|
||||
GUIDE_COMMENT: `
|
||||
<!--
|
||||
This is a static build of the frontend.
|
||||
It is automatically generated by the build process.
|
||||
Do not edit this file directly.
|
||||
To make changes, refer to the "Web UI" section in the README.
|
||||
-->
|
||||
`.trim()
|
||||
} as const;
|
||||
|
||||
export const REGEX_PATTERNS = {
|
||||
SPLASH_FILE: /^apple-splash-(portrait|landscape)-(dark-)?(\d+)x(\d+)\.png$/,
|
||||
HEAD_CLOSE: /\t*<\/head>/
|
||||
} as const;
|
||||
|
||||
// Device names used by @vite-pwa/assets-generator for splash screen generation.
|
||||
// Keep in sync with pwa-assets.config.ts.
|
||||
export const PWA_GENERATOR_DEVICES = [
|
||||
'iPhone 13',
|
||||
'iPhone 13 Pro',
|
||||
'iPhone 13 Pro Max',
|
||||
'iPhone 14',
|
||||
'iPhone 14 Plus',
|
||||
'iPhone 14 Pro',
|
||||
'iPhone 14 Pro Max',
|
||||
'iPhone 15',
|
||||
'iPhone 15 Plus',
|
||||
'iPhone 15 Pro',
|
||||
'iPhone 15 Pro Max',
|
||||
'iPhone 16',
|
||||
'iPhone 16 Plus',
|
||||
'iPhone 16 Pro',
|
||||
'iPhone 16 Pro Max',
|
||||
'iPhone 16e',
|
||||
'iPhone SE 4"',
|
||||
'iPhone SE 4.7"',
|
||||
'iPad 11"',
|
||||
'iPad Air 10.9"',
|
||||
'iPad Air 11"',
|
||||
'iPad Air 13"',
|
||||
'iPad Pro 11"',
|
||||
'iPad Pro 12.9"',
|
||||
'iPad mini 8.3"'
|
||||
] as const;
|
||||
|
||||
// PWA assets generator configuration — used by pwa-assets.config.ts
|
||||
export const PWA_ASSET_GENERATOR = {
|
||||
LINK_PRESET: '2023',
|
||||
SPLASH_PADDING: 0.75,
|
||||
FIT_MODE: 'contain',
|
||||
ADD_MEDIA_SCREEN: true,
|
||||
BASE_PATH: './',
|
||||
XHTML: false,
|
||||
PNG_COMPRESSION_LEVEL: 9,
|
||||
PNG_QUALITY: 60,
|
||||
DARK_PREFIX: 'dark-'
|
||||
} as const;
|
||||
|
||||
export const CACHE_SETTINGS = {
|
||||
IMMUTABLE_MAX_AGE_SECONDS: 31536000,
|
||||
API_CACHE_MAX_AGE_SECONDS: 60 * 60 * 24,
|
||||
API_CACHE_MAX_ENTRIES: 50,
|
||||
MAX_FILE_SIZE_BYTES: 10 * 1024 * 1024
|
||||
} as const;
|
||||
|
||||
export const GLOB_PATTERNS: string[] = [
|
||||
'**/*.{js,css,html,ico,svg,png,webp,woff,woff2,json,webmanifest}'
|
||||
];
|
||||
|
||||
// loading.html is the model loading page served by llama-server itself.
|
||||
// The SvelteKit PWA manifest transform strips the html extension from every
|
||||
// precache entry to match clean URLs, but loading.html is a plain static asset
|
||||
// with no clean URL, so static servers answer 404 and the SW install fails.
|
||||
export const GLOB_IGNORES: string[] = ['**/loading.html'];
|
||||
|
||||
export const SW_CONFIG = {
|
||||
CHECK_INTERVAL_MS: 60000,
|
||||
UPDATE_FETCH_OPTIONS: {
|
||||
CACHE: 'no-store',
|
||||
HEADERS: {
|
||||
CACHE: 'no-store',
|
||||
CACHE_CONTROL: 'no-cache'
|
||||
}
|
||||
}
|
||||
} as const;
|
||||
|
||||
// Runtime caching configuration for Workbox
|
||||
export const RUNTIME_CACHING = {
|
||||
HANDLER: 'NetworkFirst',
|
||||
CACHE_NAME: 'api-cache'
|
||||
} as const;
|
||||
|
||||
// Workbox runtime caching patterns
|
||||
export const API_CACHING_PATTERNS = {
|
||||
V1_API: /^\/v1\/.*/,
|
||||
STATIC_API: /^\/(health|props|models|tools|slots|cors-proxy).*/
|
||||
} as const;
|
||||
|
||||
// SvelteKit PWA plugin options
|
||||
export const PWA_KIT_OPTIONS = {
|
||||
NAVIGATE_FALLBACK: './'
|
||||
} as const;
|
||||
|
||||
export const APPLE_META_TAGS = {
|
||||
MOBILE_WEB_APP_CAPABLE: { name: 'apple-mobile-web-app-capable', content: 'yes' },
|
||||
STATUS_BAR_STYLE: { name: 'apple-mobile-web-app-status-bar-style', content: 'black-translucent' },
|
||||
MOBILE_WEB_APP_TITLE: { name: 'apple-mobile-web-app-title' }
|
||||
} as const;
|
||||
|
||||
// Splash screen HTML link tag prefix used by generateSplashScreenLinks
|
||||
export const SPLASH_LINK = {
|
||||
HTML: '<link rel="apple-touch-startup-image"',
|
||||
DARK_MEDIA_SUFFIX: ' and (prefers-color-scheme: dark)'
|
||||
} as const;
|
||||
|
||||
// SvelteKit PWA plugin configuration — used by @vite.config.ts
|
||||
import type { SvelteKitPWAOptions } from '@vite-pwa/sveltekit';
|
||||
|
||||
export const SVELTEKIT_PWA_OPTIONS: SvelteKitPWAOptions = {
|
||||
// Strategy: generateSW - the plugin generates a service worker automatically
|
||||
// using Workbox. For a custom SW, use 'injectManifest' instead.
|
||||
// Manifest configuration
|
||||
manifest: PWA_MANIFEST,
|
||||
|
||||
// Workbox configuration for generateSW strategy
|
||||
workbox: {
|
||||
// Match all static assets in the build output.
|
||||
// Uses '**/' because SvelteKit outputs files under _app/immutable/
|
||||
// subdirectories.
|
||||
globPatterns: GLOB_PATTERNS,
|
||||
globIgnores: GLOB_IGNORES,
|
||||
maximumFileSizeToCacheInBytes: CACHE_SETTINGS.MAX_FILE_SIZE_BYTES,
|
||||
|
||||
// Runtime caching for API calls - use NetworkFirst so APIs are always fresh
|
||||
runtimeCaching: [
|
||||
{
|
||||
urlPattern: API_CACHING_PATTERNS.V1_API,
|
||||
handler: RUNTIME_CACHING.HANDLER,
|
||||
options: {
|
||||
cacheName: RUNTIME_CACHING.CACHE_NAME,
|
||||
expiration: {
|
||||
maxEntries: CACHE_SETTINGS.API_CACHE_MAX_ENTRIES,
|
||||
maxAgeSeconds: CACHE_SETTINGS.API_CACHE_MAX_AGE_SECONDS
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
urlPattern: API_CACHING_PATTERNS.STATIC_API,
|
||||
handler: RUNTIME_CACHING.HANDLER,
|
||||
options: {
|
||||
cacheName: RUNTIME_CACHING.CACHE_NAME,
|
||||
expiration: {
|
||||
maxEntries: CACHE_SETTINGS.API_CACHE_MAX_ENTRIES,
|
||||
maxAgeSeconds: CACHE_SETTINGS.API_CACHE_MAX_AGE_SECONDS
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
devOptions: {
|
||||
enabled: true,
|
||||
suppressWarnings: true,
|
||||
// Use PWA_KIT_OPTIONS.NAVIGATE_FALLBACK to match production SW behaviour
|
||||
// (navigateFallback defaults to the configured base path, which is '/' for this SPA).
|
||||
navigateFallback: PWA_KIT_OPTIONS.NAVIGATE_FALLBACK
|
||||
},
|
||||
|
||||
// SvelteKit-specific options
|
||||
kit: {
|
||||
// Include version file for proper cache invalidation
|
||||
includeVersionFile: true
|
||||
}
|
||||
};
|
||||
@@ -31,6 +31,7 @@ export const SETTINGS_KEYS = {
|
||||
SHOW_RAW_MODEL_NAMES: 'showRawModelNames',
|
||||
SHOW_MODEL_QUANTIZATION: 'showModelQuantization',
|
||||
SHOW_MODEL_TAGS: 'showModelTags',
|
||||
SHOW_BUILD_VERSION: 'showBuildVersion',
|
||||
SHOW_SYSTEM_MESSAGE: 'showSystemMessage',
|
||||
// Sampling
|
||||
TEMPERATURE: 'temperature',
|
||||
|
||||
@@ -365,6 +365,14 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
|
||||
serverKey: SETTINGS_KEYS.ALWAYS_SHOW_AGENTIC_TURNS,
|
||||
paramType: SyncableParameterType.BOOLEAN
|
||||
}
|
||||
},
|
||||
{
|
||||
key: SETTINGS_KEYS.SHOW_BUILD_VERSION,
|
||||
label: 'Show build version information',
|
||||
help: 'Display the current build version in the bottom-right corner of the interface.',
|
||||
defaultValue: false,
|
||||
type: SettingsFieldType.CHECKBOX,
|
||||
section: SETTINGS_SECTION_SLUGS.DISPLAY
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@@ -40,6 +40,9 @@ export const DEPRECATED_MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY = `${STORAGE_APP_NA
|
||||
/** @deprecated Use {@link USER_OVERRIDES_LOCALSTORAGE_KEY} instead */
|
||||
export const DEPRECATED_USER_OVERRIDES_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME_DEPRECATED}.userOverrides`;
|
||||
|
||||
/** Build version stored in localStorage for non-PWA update detection */
|
||||
export const BUILD_VERSION_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME}.buildVersion`;
|
||||
|
||||
/** Maps new keys to their deprecated fallback keys */
|
||||
export const NEW_TO_DEPRECATED_MAP: Record<string, string> = {
|
||||
[ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY]: DEPRECATED_ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY,
|
||||
|
||||
@@ -5,7 +5,6 @@ import { ROUTES } from './routes';
|
||||
|
||||
export const FORK_TREE_DEPTH_PADDING = 8;
|
||||
export const SYSTEM_MESSAGE_PLACEHOLDER = 'System message';
|
||||
export const APP_NAME = import.meta.env.VITE_PUBLIC_APP_NAME || 'llama-ui';
|
||||
|
||||
export const ICON_STRIP_TRANSITION_DURATION = 150;
|
||||
export const ICON_STRIP_TRANSITION_DELAY_MULTIPLIER = 50;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user