mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 18:17:42 +02:00
Compare commits
51 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e6f2ec01ff | |||
| edfb440a2f | |||
| 3d66da1809 | |||
| 82b703f8bc | |||
| 51a84efc53 | |||
| b0f0dd3e51 | |||
| 0eb4764182 | |||
| 1f5d15e665 | |||
| c46758d28f | |||
| bf934f28db | |||
| 5c1a7b8355 | |||
| 59d840209a | |||
| ff934e29bc | |||
| ee051c1e4e | |||
| e6f6770515 | |||
| 48cda24c11 | |||
| 871f1a2d2f | |||
| 20197b6fe3 | |||
| ba38f3becc | |||
| 37f230dd7c | |||
| a308e584ca | |||
| d0fa2c9fbb | |||
| 9bcb4eff4d | |||
| 6861f6509a | |||
| 1743d98057 | |||
| 7ca0c9cca7 | |||
| 8c60b8a2be | |||
| 287b5b1eab | |||
| a73bbd5d92 | |||
| ded446b34c | |||
| f8d4abae86 | |||
| 3d5acab3e7 | |||
| 9900b29c3a | |||
| dc8d14c582 | |||
| 93dfbc1291 | |||
| 3cba8bba18 | |||
| 112c78159f | |||
| 0fac87b157 | |||
| 0a524f2404 | |||
| c0159f9c1f | |||
| a970515bdb | |||
| 056b50c319 | |||
| f2c72b8f1f | |||
| ec54ac13a8 | |||
| 80322ebdaf | |||
| 44c51e526b | |||
| 1922f87c2f | |||
| 345de3cd87 | |||
| 9c600bcd4b | |||
| b2704f9028 | |||
| 3fab96cd04 |
@@ -4,7 +4,7 @@
|
||||
|
||||
# Define the CANN base image for easier version updates later
|
||||
ARG CHIP_TYPE=910b
|
||||
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
|
||||
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
|
||||
|
||||
# ==============================================================================
|
||||
# BUILD STAGE
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
ARG TARGETARCH
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git cmake libssl-dev
|
||||
apt-get install -y gcc-14 g++-14 build-essential git cmake libssl-dev
|
||||
|
||||
ENV CC=gcc-14 CXX=g++-14
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -55,8 +57,9 @@ RUN apt-get update \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
&& pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt \
|
||||
python3-wheel \
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
|
||||
ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
|
||||
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS build
|
||||
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
|
||||
enableStatic ? effectiveStdenv.hostPlatform.isStatic,
|
||||
precompileMetalShaders ? false,
|
||||
useWebUi ? true,
|
||||
}:
|
||||
|
||||
let
|
||||
@@ -164,6 +165,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
cmakeFlags =
|
||||
[
|
||||
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
||||
(cmakeBool "LLAMA_BUILD_WEBUI" useWebUi)
|
||||
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
|
||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||
(cmakeBool "GGML_NATIVE" false)
|
||||
|
||||
@@ -40,13 +40,9 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# Disabled due to size (400MB) and always 0 cache hits
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.16
|
||||
# with:
|
||||
# key: android-build
|
||||
# evict-old-files: 1d
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: false
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
@@ -55,7 +51,7 @@ jobs:
|
||||
distribution: zulu
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@v3
|
||||
uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
@@ -66,10 +62,11 @@ jobs:
|
||||
|
||||
android-ndk:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
OPENCL_VERSION: 2025.07.22
|
||||
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
@@ -82,59 +79,23 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: false
|
||||
|
||||
- name: Install OpenCL Headers and Libs
|
||||
id: install_opencl
|
||||
if: ${{ matrix.build == 'arm64-snapdragon' }}
|
||||
run: |
|
||||
mkdir opencl
|
||||
curl -L -o opencl/clhpp.tar.gz https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
|
||||
curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
|
||||
curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
|
||||
tar -xaf opencl/headers.tar.gz -C opencl
|
||||
tar -xaf opencl/clhpp.tar.gz -C opencl
|
||||
tar -xaf opencl/icd-loader.tar.gz -C opencl
|
||||
sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
|
||||
sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
|
||||
cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
|
||||
cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
|
||||
cmake --build build
|
||||
sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
|
||||
rm -rf opencl
|
||||
|
||||
- name: Install Hexagon SDK
|
||||
id: install_hexsdk
|
||||
if: ${{ matrix.build == 'arm64-snapdragon' }}
|
||||
env:
|
||||
HEXSDK_VER: 6.4.0.2
|
||||
HEXTLS_VER: 19.0.04
|
||||
run: |
|
||||
curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
|
||||
mkdir hex-sdk
|
||||
tar -xaf hex-sdk.tar.gz -C hex-sdk
|
||||
ls -l hex-sdk
|
||||
sudo mv hex-sdk /opt/hexagon
|
||||
echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV"
|
||||
echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV"
|
||||
echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV"
|
||||
echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV"
|
||||
echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV"
|
||||
echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Update CMake presets
|
||||
id: update_presets
|
||||
if: ${{ matrix.build == 'arm64-snapdragon' }}
|
||||
run: |
|
||||
cp docs/backend/snapdragon/CMakeUserPresets.json .
|
||||
|
||||
- name: Build
|
||||
id: ndk_build
|
||||
- name: Build Llama.CPP for Hexagon Android
|
||||
id: build_llama_cpp_hexagon_android
|
||||
run: |
|
||||
if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
|
||||
cp docs/backend/snapdragon/CMakeUserPresets.json .
|
||||
fi
|
||||
cmake ${{ matrix.defines }} -B build
|
||||
cmake --build build
|
||||
cmake --install build --prefix pkg-adb/llama.cpp
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
echo "FIXME: test on devices"
|
||||
- name: Upload Llama.CPP Hexagon Android Build Artifact
|
||||
if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: llama-cpp-android-${{ matrix.build }}
|
||||
path: pkg-adb/llama.cpp
|
||||
|
||||
@@ -63,7 +63,7 @@ jobs:
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Setup ${{ matrix.sys }}
|
||||
uses: msys2/setup-msys2@v2
|
||||
uses: msys2/setup-msys2@cafece8e6baf9247cf9b1bf95097b0b983cc558d # v2
|
||||
with:
|
||||
update: true
|
||||
msystem: ${{matrix.sys}}
|
||||
|
||||
@@ -141,60 +141,61 @@ jobs:
|
||||
# amd-smi static
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-metal:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-webgpu:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dawn Dependency
|
||||
id: dawn-depends
|
||||
run: |
|
||||
DAWN_VERSION="v2.0.0"
|
||||
DAWN_OWNER="reeselevine"
|
||||
DAWN_REPO="dawn"
|
||||
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
curl -L -o artifact.zip \
|
||||
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
mkdir dawn
|
||||
unzip artifact.zip
|
||||
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-vulkan:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
# TODO: sandbox Mac runners
|
||||
# ggml-ci-mac-metal:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
#
|
||||
# ggml-ci-mac-webgpu:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Dawn Dependency
|
||||
# id: dawn-depends
|
||||
# run: |
|
||||
# DAWN_VERSION="v2.0.0"
|
||||
# DAWN_OWNER="reeselevine"
|
||||
# DAWN_REPO="dawn"
|
||||
# DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
|
||||
# echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
# curl -L -o artifact.zip \
|
||||
# "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
# mkdir dawn
|
||||
# unzip artifact.zip
|
||||
# tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
#
|
||||
# ggml-ci-mac-vulkan:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# vulkaninfo --summary
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-linux-intel-vulkan:
|
||||
runs-on: [self-hosted, Linux, Intel]
|
||||
|
||||
+26
-18
@@ -87,7 +87,7 @@ jobs:
|
||||
-DGGML_METAL_EMBED_LIBRARY=OFF \
|
||||
-DGGML_METAL_SHADER_DEBUG=ON \
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
|
||||
|
||||
- name: Test
|
||||
@@ -124,7 +124,7 @@ jobs:
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
@@ -165,8 +165,8 @@ jobs:
|
||||
id: cmake_build
|
||||
run: |
|
||||
export CMAKE_PREFIX_PATH=dawn
|
||||
cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
@@ -231,7 +231,7 @@ jobs:
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
@@ -274,14 +274,16 @@ jobs:
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev
|
||||
sudo apt-get install build-essential libssl-dev ninja-build
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
@@ -300,12 +302,13 @@ jobs:
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get install -y glslc libvulkan-dev libssl-dev
|
||||
sudo apt-get install -y glslc libvulkan-dev libssl-dev ninja-build
|
||||
|
||||
- name: Configure
|
||||
id: cmake_configure
|
||||
run: |
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
@@ -314,7 +317,7 @@ jobs:
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake --build build -j $(nproc)
|
||||
time cmake --build build -j $(nproc)
|
||||
|
||||
ubuntu-24-webgpu:
|
||||
runs-on: ubuntu-24.04
|
||||
@@ -336,7 +339,8 @@ jobs:
|
||||
run: |
|
||||
sudo add-apt-repository -y ppa:kisak/kisak-mesa
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers \
|
||||
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
|
||||
|
||||
- name: Get latest Vulkan SDK version
|
||||
id: vulkan_sdk_version
|
||||
@@ -378,7 +382,7 @@ jobs:
|
||||
export Dawn_DIR=dawn/lib64/cmake/Dawn
|
||||
cmake -B build \
|
||||
-DGGML_WEBGPU=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
@@ -415,11 +419,13 @@ jobs:
|
||||
run: |
|
||||
source emsdk/emsdk_env.sh
|
||||
emcmake cmake -B build-wasm \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_WEBGPU=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
|
||||
|
||||
cmake --build build-wasm --target test-backend-ops -j $(nproc)
|
||||
time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
|
||||
|
||||
ubuntu-22-hip:
|
||||
runs-on: ubuntu-22.04
|
||||
@@ -479,7 +485,7 @@ jobs:
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DGGML_MUSA=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-sycl:
|
||||
runs-on: ubuntu-22.04
|
||||
@@ -528,7 +534,7 @@ jobs:
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-sycl-fp16:
|
||||
runs-on: ubuntu-22.04
|
||||
@@ -551,7 +557,7 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
|
||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
|
||||
|
||||
- name: install oneAPI MKL library
|
||||
shell: bash
|
||||
@@ -574,11 +580,13 @@ jobs:
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DGGML_SYCL_F16=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-24-openvino:
|
||||
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
|
||||
@@ -648,7 +656,7 @@ jobs:
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENVINO=ON
|
||||
cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||
time cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
@@ -1039,7 +1047,7 @@ jobs:
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
|
||||
@@ -36,18 +36,16 @@ jobs:
|
||||
matrix:
|
||||
config:
|
||||
# Multi-stage build
|
||||
# Note: the arm64 images are failing, which prevents the amd64 images from being built
|
||||
# https://github.com/ggml-org/llama.cpp/issues/11888
|
||||
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "12.4.0", ubuntu_version: "22.04" }
|
||||
- { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "13.1.0", ubuntu_version: "24.04" }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
|
||||
- { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/arm64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04" }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04" }
|
||||
- { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04", cuda_version: "12.4.0", ubuntu_version: "22.04" }
|
||||
- { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04", cuda_version: "13.1.0", ubuntu_version: "24.04" }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04" }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04" }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04" }
|
||||
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04-s390x" }
|
||||
- { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04" }
|
||||
- { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v6
|
||||
@@ -56,15 +54,15 @@ jobs:
|
||||
|
||||
- name: Set up QEMU
|
||||
if: ${{ matrix.config.tag != 's390x' }}
|
||||
uses: docker/setup-qemu-action@v3
|
||||
uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3
|
||||
with:
|
||||
image: tonistiigi/binfmt:qemu-v7.0.0-28
|
||||
image: tonistiigi/binfmt:qemu-v10.2.1
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
@@ -127,7 +125,7 @@ jobs:
|
||||
|
||||
- name: Build and push Full Docker image (tagged + versioned)
|
||||
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
|
||||
uses: docker/build-push-action@v6
|
||||
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
@@ -152,7 +150,7 @@ jobs:
|
||||
|
||||
- name: Build and push Light Docker image (tagged + versioned)
|
||||
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
|
||||
uses: docker/build-push-action@v6
|
||||
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
@@ -177,7 +175,7 @@ jobs:
|
||||
|
||||
- name: Build and push Server Docker image (tagged + versioned)
|
||||
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
|
||||
uses: docker/build-push-action@v6
|
||||
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
|
||||
@@ -23,7 +23,7 @@ jobs:
|
||||
runs-on: ubuntu-slim
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@v2
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
|
||||
with:
|
||||
version: v3.0.3
|
||||
- run: editorconfig-checker
|
||||
|
||||
@@ -38,7 +38,7 @@ jobs:
|
||||
- name: Build package
|
||||
run: cd gguf-py && poetry build
|
||||
- name: Publish package
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1
|
||||
with:
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
packages-dir: gguf-py/dist
|
||||
|
||||
@@ -8,7 +8,8 @@ on:
|
||||
paths: [
|
||||
'.github/workflows/hip-quality-check.yml',
|
||||
'**/*.cu',
|
||||
'**/*.cuh'
|
||||
'**/*.cuh',
|
||||
'scripts/hip/gcn-cdna-vgpr-check.py'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
@@ -16,7 +17,8 @@ on:
|
||||
paths: [
|
||||
'.github/workflows/hip-quality-check.yml',
|
||||
'**/*.cu',
|
||||
'**/*.cuh'
|
||||
'**/*.cuh',
|
||||
'scripts/hip/gcn-cdna-vgpr-check.py'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
|
||||
@@ -31,6 +31,6 @@ jobs:
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: flake8 Lint
|
||||
uses: py-actions/flake8@v2
|
||||
uses: py-actions/flake8@84ec6726560b6d5bd68f2a5bed83d62b52bb50ba # v2
|
||||
with:
|
||||
plugins: "flake8-no-print"
|
||||
|
||||
@@ -907,7 +907,7 @@ jobs:
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
|
||||
@@ -108,6 +108,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
|
||||
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
|
||||
|
||||
|
||||
@@ -57,6 +57,13 @@ SRC=`pwd`
|
||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
|
||||
CTEST_EXTRA=""
|
||||
|
||||
# Default to use make unless specified for compatibility
|
||||
CMAKE_GENERATOR="Unix Makefiles"
|
||||
|
||||
if [ ! -z "${GG_BUILD_NINJA}" ]; then
|
||||
CMAKE_GENERATOR="Ninja"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
fi
|
||||
@@ -242,13 +249,13 @@ function gg_run_ctest_debug {
|
||||
|
||||
set -e
|
||||
|
||||
# Check cmake, make and ctest are installed
|
||||
# Check cmake and ctest are installed
|
||||
gg_check_build_requirements
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
}
|
||||
@@ -273,16 +280,16 @@ function gg_run_ctest_release {
|
||||
|
||||
set -e
|
||||
|
||||
# Check cmake, make and ctest are installed
|
||||
# Check cmake and ctest are installed
|
||||
gg_check_build_requirements
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
(time ctest --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest -C Release --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
else
|
||||
(time ctest --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest -C Release --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
fi
|
||||
|
||||
set +e
|
||||
@@ -340,7 +347,7 @@ function gg_run_ctest_with_model_debug {
|
||||
cd build-ci-debug
|
||||
set -e
|
||||
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest -C Debug --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
cd ..
|
||||
@@ -353,7 +360,7 @@ function gg_run_ctest_with_model_release {
|
||||
cd build-ci-release
|
||||
set -e
|
||||
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest -C Release --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
# test memory leaks
|
||||
#if [[ ! -z ${GG_BUILD_METAL} ]]; then
|
||||
@@ -407,8 +414,8 @@ function gg_run_qwen3_0_6b {
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf --outtype f16
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16
|
||||
@@ -556,8 +563,8 @@ function gg_run_embd_bge_small {
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
@@ -601,8 +608,8 @@ function gg_run_rerank_tiny {
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
@@ -652,10 +659,6 @@ function gg_check_build_requirements {
|
||||
gg_printf 'cmake not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v make &> /dev/null; then
|
||||
gg_printf 'make not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v ctest &> /dev/null; then
|
||||
gg_printf 'ctest not found, please install'
|
||||
fi
|
||||
|
||||
+22
-3
@@ -423,6 +423,9 @@ static bool parse_bool_value(const std::string & value) {
|
||||
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
|
||||
common_params & params = ctx_arg.params;
|
||||
|
||||
// setup log directly from params.verbosity: see tools/cli/cli.cpp
|
||||
common_log_set_verbosity_thold(params.verbosity);
|
||||
|
||||
std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
|
||||
for (auto & opt : ctx_arg.options) {
|
||||
for (const auto & arg : opt.args) {
|
||||
@@ -631,8 +634,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
));
|
||||
}
|
||||
|
||||
common_log_set_verbosity_thold(params.verbosity);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1078,7 +1079,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.verbose_prompt = true;
|
||||
}
|
||||
));
|
||||
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
|
||||
add_opt(common_arg(
|
||||
{"--display-prompt"},
|
||||
{"--no-display-prompt"},
|
||||
@@ -2806,6 +2807,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.port = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
|
||||
add_opt(common_arg(
|
||||
{"--reuse-port"},
|
||||
string_format("allow multiple sockets to bind to the same port (default: %s)", params.reuse_port ? "enabled" : "disabled"),
|
||||
[](common_params & params) {
|
||||
params.reuse_port = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_REUSE_PORT"));
|
||||
add_opt(common_arg(
|
||||
{"--path"}, "PATH",
|
||||
string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
|
||||
@@ -2842,6 +2850,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.webui_mcp_proxy = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
|
||||
add_opt(common_arg(
|
||||
{"--tools"}, "TOOL1,TOOL2,...",
|
||||
"experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n"
|
||||
"specify \"all\" to enable all tools\n"
|
||||
"available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.server_tools = parse_csv_row(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
|
||||
add_opt(common_arg(
|
||||
{"--webui"},
|
||||
{"--no-webui"},
|
||||
@@ -3244,6 +3261,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
|
||||
[](common_params & params) {
|
||||
params.verbosity = INT_MAX;
|
||||
common_log_set_verbosity_thold(INT_MAX);
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
@@ -3264,6 +3282,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"(default: %d)\n", params.verbosity),
|
||||
[](common_params & params, int value) {
|
||||
params.verbosity = value;
|
||||
common_log_set_verbosity_thold(value);
|
||||
}
|
||||
).set_env("LLAMA_LOG_VERBOSITY"));
|
||||
add_opt(common_arg(
|
||||
|
||||
@@ -287,7 +287,7 @@ void analyze_reasoning::compare_reasoning_presence() {
|
||||
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
|
||||
});
|
||||
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
|
||||
return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
|
||||
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
|
||||
});
|
||||
// try the more aggressive parse first, if it fails, fall back to the delimiter one
|
||||
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
|
||||
@@ -297,7 +297,7 @@ void analyze_reasoning::compare_reasoning_presence() {
|
||||
if (result.result.success()) {
|
||||
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
start = trim_whitespace(result.tags["pre"]);
|
||||
start = trim_leading_whitespace(result.tags["pre"]);
|
||||
end = trim_trailing_whitespace(result.tags["post"]);
|
||||
} else if (!result.tags["post"].empty()) {
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
@@ -333,7 +333,7 @@ void analyze_reasoning::compare_thinking_enabled() {
|
||||
if (left_trimmed.empty() && !diff.right.empty()) {
|
||||
if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
|
||||
if (start.empty()) {
|
||||
start = right_trimmed;
|
||||
start = trim_leading_whitespace(diff.right);
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
}
|
||||
}
|
||||
@@ -344,7 +344,7 @@ void analyze_reasoning::compare_thinking_enabled() {
|
||||
if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
|
||||
start = seg[seg.size() - 2].value;
|
||||
}
|
||||
end = left_trimmed;
|
||||
end = trim_trailing_whitespace(diff.left);
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
}
|
||||
}
|
||||
@@ -363,15 +363,23 @@ void analyze_reasoning::compare_thinking_enabled() {
|
||||
size_t len = std::min(base.size(), anchor_len);
|
||||
std::string anchor = base.substr(base.size() - len);
|
||||
auto pos = extended.rfind(anchor);
|
||||
if (pos == std::string::npos || pos + len >= extended.size()) continue;
|
||||
if (pos == std::string::npos || pos + len >= extended.size()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string extra = trim_whitespace(extended.substr(pos + len));
|
||||
if (extra.empty()) continue;
|
||||
if (extra.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto seg = prune_whitespace_segments(segmentize_markers(extra));
|
||||
if (seg.size() == 2 && seg[0].type == segment_type::MARKER && seg[1].type == segment_type::MARKER) {
|
||||
if (start.empty()) start = seg[0].value;
|
||||
if (end.empty()) end = seg[1].value;
|
||||
if (start.empty()) {
|
||||
start = seg[0].value;
|
||||
}
|
||||
if (end.empty()) {
|
||||
end = seg[1].value;
|
||||
}
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
break;
|
||||
}
|
||||
@@ -423,7 +431,7 @@ void analyze_reasoning::compare_reasoning_scope() {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Detected TOOLS_ONLY reasoning mode\n" ANSI_RESET, __func__);
|
||||
|
||||
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
|
||||
return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
|
||||
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
|
||||
});
|
||||
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
|
||||
if (result.result.success()) {
|
||||
@@ -516,7 +524,7 @@ analyze_content::analyze_content(const common_chat_template & tmpl, const analyz
|
||||
// Take the more promising diff
|
||||
std::string pure_content = rdiff.length() > diff_tools.left.length() ? rdiff : diff_tools.left;
|
||||
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
|
||||
return p.tag("pre", p.marker()) + p.space() + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
|
||||
return p.tag("pre", p.marker() + p.space()) + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
|
||||
});
|
||||
auto result = parser_wrapped.parse_anywhere_and_extract(pure_content);
|
||||
start = result.tags["pre"];
|
||||
|
||||
+8
-1
@@ -971,6 +971,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
|
||||
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
|
||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
auto start = p.rule("start", p.literal("<|start|>assistant"));
|
||||
@@ -979,7 +980,13 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
auto channel = p.literal("<|channel|>") + (p.literal("commentary") | p.literal("analysis"));
|
||||
auto constrain_type = p.chars("[A-Za-z0-9_-]", 1, -1);
|
||||
|
||||
auto analysis = p.rule("analysis", p.literal("<|channel|>analysis<|message|>") + p.reasoning(content) + end);
|
||||
if (extract_reasoning) {
|
||||
p.rule("analysis", p.literal("<|channel|>analysis<|message|>") + p.reasoning(content) + end);
|
||||
} else {
|
||||
p.rule("analysis", p.content(p.literal("<|channel|>analysis<|message|>") + content + end));
|
||||
}
|
||||
|
||||
auto analysis = p.ref("analysis");
|
||||
auto preamble = p.rule("preamble", p.literal("<|channel|>commentary<|message|>") + p.content(content) + end);
|
||||
auto final_msg = p.rule("final", p.literal("<|channel|>final<|message|>") + p.content(content));
|
||||
auto any = p.rule("any", preamble | analysis);
|
||||
|
||||
@@ -656,6 +656,38 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
||||
return true;
|
||||
}
|
||||
|
||||
// simple glob: * matches non-/ chars, ** matches anything including /
|
||||
static inline bool glob_match(const char * pattern, const char * str) {
|
||||
if (*pattern == '\0') {
|
||||
return *str == '\0';
|
||||
}
|
||||
if (pattern[0] == '*' && pattern[1] == '*') {
|
||||
const char * p = pattern + 2;
|
||||
if (*p == '/') p++;
|
||||
if (glob_match(p, str)) return true;
|
||||
if (*str != '\0') return glob_match(pattern, str + 1);
|
||||
return false;
|
||||
}
|
||||
if (*pattern == '*') {
|
||||
const char * p = pattern + 1;
|
||||
for (; *str != '\0' && *str != '/'; str++) {
|
||||
if (glob_match(p, str)) return true;
|
||||
}
|
||||
return glob_match(p, str);
|
||||
}
|
||||
if (*pattern == '?' && *str != '\0' && *str != '/') {
|
||||
return glob_match(pattern + 1, str + 1);
|
||||
}
|
||||
if (*pattern == *str) {
|
||||
return glob_match(pattern + 1, str + 1);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool glob_match(const std::string & pattern, const std::string & str) {
|
||||
return glob_match(pattern.c_str(), str.c_str());
|
||||
}
|
||||
|
||||
//
|
||||
// Filesystem utils
|
||||
//
|
||||
|
||||
@@ -573,6 +573,7 @@ struct common_params {
|
||||
|
||||
// server params
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
bool reuse_port = false; // allow multiple sockets to bind to the same port
|
||||
int32_t timeout_read = 600; // http read timeout in seconds
|
||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
@@ -613,6 +614,9 @@ struct common_params {
|
||||
bool endpoint_props = false; // only control POST requests, not GET
|
||||
bool endpoint_metrics = false;
|
||||
|
||||
// enable built-in tools
|
||||
std::vector<std::string> server_tools;
|
||||
|
||||
// router server configs
|
||||
std::string models_dir = ""; // directory containing models for the router server
|
||||
std::string models_preset = ""; // directory containing model presets for the router server
|
||||
@@ -790,6 +794,8 @@ std::string string_from(const std::vector<int> & values);
|
||||
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
|
||||
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
|
||||
|
||||
bool glob_match(const std::string & pattern, const std::string & str);
|
||||
|
||||
//
|
||||
// Filesystem utils
|
||||
//
|
||||
|
||||
+19
-5
@@ -454,7 +454,9 @@ static gguf_split_info get_gguf_split_info(const std::string & path) {
|
||||
std::smatch m;
|
||||
|
||||
std::string prefix = path;
|
||||
string_remove_suffix(prefix, ".gguf");
|
||||
if (!string_remove_suffix(prefix, ".gguf")) {
|
||||
return {};
|
||||
}
|
||||
|
||||
int index = 1;
|
||||
int count = 1;
|
||||
@@ -546,6 +548,20 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
|
||||
return best;
|
||||
}
|
||||
|
||||
static bool gguf_filename_is_model(const std::string & filepath) {
|
||||
if (!string_ends_with(filepath, ".gguf")) {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string filename = filepath;
|
||||
if (auto pos = filename.rfind('/'); pos != std::string::npos) {
|
||||
filename = filename.substr(pos + 1);
|
||||
}
|
||||
|
||||
return filename.find("mmproj") == std::string::npos &&
|
||||
filename.find("imatrix") == std::string::npos;
|
||||
}
|
||||
|
||||
static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
|
||||
const std::string & tag) {
|
||||
std::vector<std::string> tags;
|
||||
@@ -559,8 +575,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
|
||||
for (const auto & t : tags) {
|
||||
std::regex pattern(t + "[.-]", std::regex::icase);
|
||||
for (const auto & f : files) {
|
||||
if (string_ends_with(f.path, ".gguf") &&
|
||||
f.path.find("mmproj") == std::string::npos &&
|
||||
if (gguf_filename_is_model(f.path) &&
|
||||
std::regex_search(f.path, pattern)) {
|
||||
return f;
|
||||
}
|
||||
@@ -568,8 +583,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
|
||||
}
|
||||
|
||||
for (const auto & f : files) {
|
||||
if (string_ends_with(f.path, ".gguf") &&
|
||||
f.path.find("mmproj") == std::string::npos) {
|
||||
if (gguf_filename_is_model(f.path)) {
|
||||
return f;
|
||||
}
|
||||
}
|
||||
|
||||
+167
-40
@@ -26,6 +26,8 @@ namespace nl = nlohmann;
|
||||
#include <windows.h>
|
||||
#else
|
||||
#define HOME_DIR "HOME"
|
||||
#include <unistd.h>
|
||||
#include <pwd.h>
|
||||
#endif
|
||||
|
||||
namespace hf_cache {
|
||||
@@ -38,6 +40,7 @@ static fs::path get_cache_directory() {
|
||||
const char * var;
|
||||
fs::path path;
|
||||
} entries[] = {
|
||||
{"LLAMA_CACHE", fs::path()},
|
||||
{"HF_HUB_CACHE", fs::path()},
|
||||
{"HUGGINGFACE_HUB_CACHE", fs::path()},
|
||||
{"HF_HOME", fs::path("hub")},
|
||||
@@ -50,6 +53,13 @@ static fs::path get_cache_directory() {
|
||||
return entry.path.empty() ? base : base / entry.path;
|
||||
}
|
||||
}
|
||||
#ifndef _WIN32
|
||||
const struct passwd * pw = getpwuid(getuid());
|
||||
|
||||
if (pw->pw_dir && *pw->pw_dir) {
|
||||
return fs::path(pw->pw_dir) / ".cache" / "huggingface" / "hub";
|
||||
}
|
||||
#endif
|
||||
throw std::runtime_error("Failed to determine HF cache directory");
|
||||
}();
|
||||
|
||||
@@ -325,9 +335,15 @@ hf_files get_repo_files(const std::string & repo_id,
|
||||
if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) {
|
||||
file.oid = item["lfs"]["oid"].get<std::string>();
|
||||
}
|
||||
if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) {
|
||||
file.size = item["lfs"]["size"].get<size_t>();
|
||||
}
|
||||
} else if (item.contains("oid") && item["oid"].is_string()) {
|
||||
file.oid = item["oid"].get<std::string>();
|
||||
}
|
||||
if (file.size == 0 && item.contains("size") && item["size"].is_number()) {
|
||||
file.size = item["size"].get<size_t>();
|
||||
}
|
||||
|
||||
if (!file.oid.empty() && !is_valid_oid(file.oid)) {
|
||||
LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str());
|
||||
@@ -487,6 +503,34 @@ std::string finalize_file(const hf_file & file) {
|
||||
|
||||
// delete everything after this line, one day
|
||||
|
||||
// copied from download.cpp without the tag part
|
||||
struct gguf_split_info {
|
||||
std::string prefix; // tag included
|
||||
int index;
|
||||
int count;
|
||||
};
|
||||
|
||||
static gguf_split_info get_gguf_split_info(const std::string & path) {
|
||||
static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase);
|
||||
std::smatch m;
|
||||
|
||||
std::string prefix = path;
|
||||
if (!string_remove_suffix(prefix, ".gguf")) {
|
||||
return {};
|
||||
}
|
||||
|
||||
int index = 1;
|
||||
int count = 1;
|
||||
|
||||
if (std::regex_match(prefix, m, re_split)) {
|
||||
index = std::stoi(m[2].str());
|
||||
count = std::stoi(m[3].str());
|
||||
prefix = m[1].str();
|
||||
}
|
||||
|
||||
return {std::move(prefix), index, count};
|
||||
}
|
||||
|
||||
static std::pair<std::string, std::string> parse_manifest_name(std::string & filename) {
|
||||
static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)");
|
||||
std::smatch match;
|
||||
@@ -504,25 +548,30 @@ static std::string make_old_cache_filename(const std::string & owner,
|
||||
return result;
|
||||
}
|
||||
|
||||
static bool migrate_single_file(const fs::path & old_cache,
|
||||
const std::string & owner,
|
||||
const std::string & repo,
|
||||
const nl::json & node,
|
||||
const hf_files & files) {
|
||||
struct migrate_file {
|
||||
std::string path;
|
||||
std::string sha256;
|
||||
size_t size;
|
||||
fs::path old_path;
|
||||
fs::path etag_path;
|
||||
const hf_file * file;
|
||||
};
|
||||
|
||||
if (!node.contains("rfilename") ||
|
||||
!node.contains("lfs") ||
|
||||
!node["lfs"].contains("sha256")) {
|
||||
return false;
|
||||
}
|
||||
using migrate_files = std::vector<migrate_file>;
|
||||
|
||||
std::string path = node["rfilename"];
|
||||
std::string sha256 = node["lfs"]["sha256"];
|
||||
static bool collect_file(const fs::path & old_cache,
|
||||
const std::string & owner,
|
||||
const std::string & repo,
|
||||
const std::string & path,
|
||||
const std::string & sha256,
|
||||
const hf_files & files,
|
||||
migrate_files & to_migrate) {
|
||||
|
||||
const hf_file * file = nullptr;
|
||||
|
||||
const hf_file * file_info = nullptr;
|
||||
for (const auto & f : files) {
|
||||
if (f.path == path) {
|
||||
file_info = &f;
|
||||
file = &f;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -532,50 +581,104 @@ static bool migrate_single_file(const fs::path & old_cache,
|
||||
fs::path etag_path = old_path.string() + ".etag";
|
||||
|
||||
if (!fs::exists(old_path)) {
|
||||
if (fs::exists(etag_path)) {
|
||||
LOG_WRN("%s: %s is orphan, deleting...\n", __func__, etag_path.string().c_str());
|
||||
fs::remove(etag_path);
|
||||
if (file && fs::exists(file->final_path)) {
|
||||
return true;
|
||||
}
|
||||
LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
bool delete_old_path = false;
|
||||
|
||||
if (!file_info) {
|
||||
LOG_WRN("%s: %s not found in current repo, deleting...\n", __func__, old_filename.c_str());
|
||||
delete_old_path = true;
|
||||
} else if (!sha256.empty() && !file_info->oid.empty() && sha256 != file_info->oid) {
|
||||
LOG_WRN("%s: %s is not up to date (sha256 mismatch), deleting...\n", __func__, old_filename.c_str());
|
||||
delete_old_path = true;
|
||||
if (!file) {
|
||||
LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
std::error_code ec;
|
||||
if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) {
|
||||
LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (delete_old_path) {
|
||||
fs::remove(old_path, ec);
|
||||
fs::remove(etag_path, ec);
|
||||
if (file->size > 0) {
|
||||
size_t size = fs::file_size(old_path);
|
||||
if (size != file->size) {
|
||||
LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file});
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool collect_files(const fs::path & old_cache,
|
||||
const std::string & owner,
|
||||
const std::string & repo,
|
||||
const nl::json & node,
|
||||
const hf_files & files,
|
||||
migrate_files & to_migrate) {
|
||||
|
||||
if (!node.contains("rfilename") ||
|
||||
!node.contains("lfs") ||
|
||||
!node["lfs"].contains("sha256")) {
|
||||
return true;
|
||||
}
|
||||
|
||||
fs::path new_path(file_info->local_path);
|
||||
std::string path = node["rfilename"];
|
||||
std::string sha256 = node["lfs"]["sha256"];
|
||||
|
||||
auto split = get_gguf_split_info(path);
|
||||
|
||||
if (split.count <= 1) {
|
||||
return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate);
|
||||
}
|
||||
|
||||
std::vector<std::pair<std::string, std::string>> splits;
|
||||
|
||||
for (const auto & f : files) {
|
||||
auto split_f = get_gguf_split_info(f.path);
|
||||
if (split_f.count == split.count && split_f.prefix == split.prefix) {
|
||||
// sadly the manifest only provides the sha256 of the first file (index == 1)
|
||||
// the rest will be verified using the size...
|
||||
std::string f_sha256 = (split_f.index == 1) ? sha256 : "";
|
||||
splits.emplace_back(f.path, f_sha256);
|
||||
}
|
||||
}
|
||||
|
||||
if ((int)splits.size() != split.count) {
|
||||
LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size());
|
||||
return false;
|
||||
}
|
||||
|
||||
for (const auto & [f_path, f_sha256] : splits) {
|
||||
if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool migrate_file(const migrate_file & file) {
|
||||
std::error_code ec;
|
||||
|
||||
fs::path new_path(file.file->local_path);
|
||||
fs::create_directories(new_path.parent_path(), ec);
|
||||
|
||||
if (!fs::exists(new_path, ec)) {
|
||||
fs::rename(old_path, new_path, ec);
|
||||
fs::rename(file.old_path, new_path, ec);
|
||||
if (ec) {
|
||||
fs::copy_file(old_path, new_path, ec);
|
||||
fs::copy_file(file.old_path, new_path, ec);
|
||||
if (ec) {
|
||||
LOG_WRN("%s: failed to move/copy %s: %s\n", __func__, old_path.string().c_str(), ec.message().c_str());
|
||||
LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
fs::remove(old_path, ec);
|
||||
fs::remove(file.old_path, ec);
|
||||
}
|
||||
fs::remove(etag_path, ec);
|
||||
|
||||
std::string filename = finalize_file(*file_info);
|
||||
LOG_INF("%s: migrated %s -> %s\n", __func__, old_filename.c_str(), filename.c_str());
|
||||
fs::remove(file.etag_path, ec);
|
||||
|
||||
std::string filename = finalize_file(*file.file);
|
||||
LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -624,19 +727,43 @@ void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) {
|
||||
continue;
|
||||
}
|
||||
|
||||
migrate_files to_migrate;
|
||||
bool ok = true;
|
||||
|
||||
try {
|
||||
std::ifstream manifest(entry.path());
|
||||
auto json = nl::json::parse(manifest);
|
||||
|
||||
for (const char * key : {"ggufFile", "mmprojFile"}) {
|
||||
if (json.contains(key)) {
|
||||
migrate_single_file(old_cache, owner, repo, json[key], files);
|
||||
if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (const auto & file : to_migrate) {
|
||||
if (!migrate_file(file)) {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
LOG_WRN("%s: migration failed: could not migrate all files\n", __func__);
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str());
|
||||
fs::remove(entry.path());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@ struct hf_file {
|
||||
std::string final_path;
|
||||
std::string oid;
|
||||
std::string repo_id;
|
||||
size_t size = 0; // only for the migration
|
||||
};
|
||||
|
||||
using hf_files = std::vector<hf_file>;
|
||||
|
||||
+12
-11
@@ -115,9 +115,11 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
|
||||
break;
|
||||
}
|
||||
case REASONING_BUDGET_FORCING:
|
||||
// force_pos is advanced in apply(), not here.
|
||||
// This ensures the first forced token isn't skipped when the sampler
|
||||
// is initialized directly in FORCING state (e.g. COUNTING + budget=0)
|
||||
ctx->force_pos++;
|
||||
if (ctx->force_pos >= ctx->forced_tokens.size()) {
|
||||
ctx->state = REASONING_BUDGET_DONE;
|
||||
LOG_INF("reasoning-budget: forced sequence complete, done\n");
|
||||
}
|
||||
break;
|
||||
case REASONING_BUDGET_DONE:
|
||||
break;
|
||||
@@ -144,14 +146,6 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
}
|
||||
}
|
||||
|
||||
// advance to next forced token (done here rather than in accept so that
|
||||
// the first forced token isn't skipped when starting in FORCING state)
|
||||
ctx->force_pos++;
|
||||
if (ctx->force_pos >= ctx->forced_tokens.size()) {
|
||||
ctx->state = REASONING_BUDGET_DONE;
|
||||
LOG_INF("reasoning-budget: forced sequence complete, done\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
|
||||
@@ -261,3 +255,10 @@ struct llama_sampler * common_reasoning_budget_init(
|
||||
common_reasoning_budget_state initial_state) {
|
||||
return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
|
||||
}
|
||||
|
||||
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl) {
|
||||
if (!smpl) {
|
||||
return REASONING_BUDGET_IDLE;
|
||||
}
|
||||
return ((const common_reasoning_budget_ctx *)smpl->ctx)->state;
|
||||
}
|
||||
|
||||
@@ -51,3 +51,5 @@ struct llama_sampler * common_reasoning_budget_init(
|
||||
const std::vector<llama_token> & forced_tokens,
|
||||
int32_t budget,
|
||||
common_reasoning_budget_state initial_state);
|
||||
|
||||
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);
|
||||
|
||||
+46
-10
@@ -7,6 +7,7 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <climits>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <unordered_map>
|
||||
@@ -109,6 +110,7 @@ struct common_sampler {
|
||||
common_params_sampling params;
|
||||
|
||||
struct llama_sampler * grmr;
|
||||
struct llama_sampler * rbudget;
|
||||
struct llama_sampler * chain;
|
||||
|
||||
ring_buffer<llama_token> prev;
|
||||
@@ -188,6 +190,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
lparams.no_perf = params.no_perf;
|
||||
|
||||
llama_sampler * grmr = nullptr;
|
||||
llama_sampler * rbudget = nullptr;
|
||||
llama_sampler * chain = llama_sampler_chain_init(lparams);
|
||||
|
||||
std::vector<llama_sampler *> samplers;
|
||||
@@ -270,7 +273,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
}
|
||||
}
|
||||
|
||||
if (grmr) {
|
||||
if (grmr && !params.grammar_lazy) {
|
||||
try {
|
||||
for (const auto & token : prefill_tokens) {
|
||||
llama_sampler_accept(grmr, token);
|
||||
@@ -284,15 +287,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
}
|
||||
}
|
||||
|
||||
// reasoning budget sampler — added first so it can force tokens before other samplers
|
||||
if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
|
||||
samplers.push_back(common_reasoning_budget_init(
|
||||
// reasoning budget sampler
|
||||
if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty()) {
|
||||
rbudget = common_reasoning_budget_init(
|
||||
vocab,
|
||||
params.reasoning_budget_start,
|
||||
params.reasoning_budget_end,
|
||||
params.reasoning_budget_forced,
|
||||
params.reasoning_budget_tokens,
|
||||
prefill_tokens));
|
||||
params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
|
||||
prefill_tokens);
|
||||
}
|
||||
|
||||
if (params.has_logit_bias()) {
|
||||
@@ -383,6 +386,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
auto * result = new common_sampler {
|
||||
/* .params = */ params,
|
||||
/* .grmr = */ grmr,
|
||||
/* .rbudget = */ rbudget,
|
||||
/* .chain = */ chain,
|
||||
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
|
||||
/* .cur = */ {},
|
||||
@@ -398,11 +402,27 @@ void common_sampler_free(struct common_sampler * gsmpl) {
|
||||
}
|
||||
|
||||
llama_sampler_free(gsmpl->grmr);
|
||||
llama_sampler_free(gsmpl->rbudget);
|
||||
llama_sampler_free(gsmpl->chain);
|
||||
|
||||
delete gsmpl;
|
||||
}
|
||||
|
||||
static bool grammar_should_apply(struct common_sampler * gsmpl) {
|
||||
if (!gsmpl->grmr) {
|
||||
return false;
|
||||
}
|
||||
if (!gsmpl->rbudget) {
|
||||
return true;
|
||||
}
|
||||
if (gsmpl->params.grammar_lazy) {
|
||||
// if grammar is lazy, only apply when reasoning budget is not active
|
||||
const auto state = common_reasoning_budget_get_state(gsmpl->rbudget);
|
||||
return state == REASONING_BUDGET_IDLE || state == REASONING_BUDGET_DONE;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||
if (!gsmpl) {
|
||||
return;
|
||||
@@ -410,6 +430,11 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
|
||||
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
// grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
|
||||
accept_grammar = accept_grammar && grammar_should_apply(gsmpl);
|
||||
|
||||
llama_sampler_accept(gsmpl->rbudget, token);
|
||||
|
||||
if (gsmpl->grmr && accept_grammar) {
|
||||
llama_sampler_accept(gsmpl->grmr, token);
|
||||
}
|
||||
@@ -431,6 +456,7 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||
return new common_sampler {
|
||||
/* .params = */ gsmpl->params,
|
||||
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
||||
/* .rbudget = */ llama_sampler_clone(gsmpl->rbudget),
|
||||
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
||||
/* .prev = */ gsmpl->prev,
|
||||
/* .cur = */ gsmpl->cur,
|
||||
@@ -500,6 +526,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
llama_token id = LLAMA_TOKEN_NULL;
|
||||
|
||||
auto & grmr = gsmpl->grmr;
|
||||
auto & rbudget = gsmpl->rbudget;
|
||||
auto & chain = gsmpl->chain;
|
||||
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
|
||||
|
||||
@@ -511,7 +538,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
if (id != LLAMA_TOKEN_NULL) {
|
||||
LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
|
||||
|
||||
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
|
||||
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
|
||||
GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");
|
||||
|
||||
// TODO: simplify
|
||||
gsmpl->cur.resize(1);
|
||||
@@ -524,7 +552,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
if (grammar_first) {
|
||||
// apply reasoning budget first
|
||||
llama_sampler_apply(rbudget, &cur_p);
|
||||
|
||||
if (grammar_first && grammar_should_apply(gsmpl)) {
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
}
|
||||
|
||||
@@ -532,7 +563,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
|
||||
id = cur_p.data[cur_p.selected].id;
|
||||
|
||||
if (grammar_first) {
|
||||
if (grammar_first || !grammar_should_apply(gsmpl)) {
|
||||
return id;
|
||||
}
|
||||
|
||||
@@ -553,7 +584,12 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
llama_sampler_apply(rbudget, &cur_p);
|
||||
|
||||
if (grammar_should_apply(gsmpl)) {
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
}
|
||||
|
||||
llama_sampler_apply(chain, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
|
||||
|
||||
+281
-21
@@ -486,7 +486,7 @@ class ModelBase:
|
||||
elif quant_method == "modelopt":
|
||||
# Mixed-precision ModelOpt models: NVFP4 tensors are handled by
|
||||
# _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and
|
||||
# are dequantized here. input_scale tensors are unused.
|
||||
# are dequantized here. k/v scale tensors are unused.
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale"):
|
||||
weight_name = name.removesuffix("_scale")
|
||||
@@ -494,7 +494,7 @@ class ModelBase:
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
|
||||
tensors_to_remove.append(name)
|
||||
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
|
||||
if name.endswith((".k_scale", ".v_scale")):
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method is not None:
|
||||
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
|
||||
@@ -542,7 +542,6 @@ class ModelBase:
|
||||
raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
# Handle gate/up expert tensor fusion if enabled
|
||||
@@ -607,7 +606,12 @@ class ModelBase:
|
||||
def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
|
||||
return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
|
||||
|
||||
def _repack_nvfp4(self, new_name: str, weight: Tensor, scale: Tensor, scale2: Tensor):
|
||||
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
|
||||
if "language_model." in name:
|
||||
name = name.replace("language_model.", "")
|
||||
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
raw, shape = self._nvfp4_pack(weight, scale)
|
||||
logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
|
||||
self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
|
||||
@@ -619,10 +623,18 @@ class ModelBase:
|
||||
logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
|
||||
self.gguf_writer.add_tensor(scale_name, scale2_f32)
|
||||
|
||||
# Emit per-tensor input_scale as a separate F32 tensor when non-trivial
|
||||
if not self._nvfp4_scale2_is_trivial(input_scale):
|
||||
input_scale_f32 = input_scale.float().numpy().flatten()
|
||||
input_scale_name = new_name.replace(".weight", ".input_scale")
|
||||
logger.info(f" + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
|
||||
self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
|
||||
|
||||
def _generate_nvfp4_tensors(self):
|
||||
# Per-layer expert merging to avoid holding all experts in memory
|
||||
expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {}
|
||||
expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
|
||||
expert_input_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
|
||||
expert_shapes: dict[tuple[int, str], list[int]] = {}
|
||||
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
|
||||
consumed: list[str] = []
|
||||
@@ -632,6 +644,7 @@ class ModelBase:
|
||||
continue
|
||||
scale_name = name.replace(".weight", ".weight_scale")
|
||||
scale2_name = name.replace(".weight", ".weight_scale_2")
|
||||
input_scale_name = name.replace(".weight", ".input_scale")
|
||||
if scale_name not in self.model_tensors:
|
||||
continue
|
||||
# Force eager materialization of lazy tensors
|
||||
@@ -643,11 +656,14 @@ class ModelBase:
|
||||
continue
|
||||
|
||||
scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))())
|
||||
input_scale = LazyTorchTensor.to_eager(self.model_tensors.get(input_scale_name, lambda: torch.tensor(1.0))())
|
||||
|
||||
# Mark tensors for removal from model_tensors (already written to gguf)
|
||||
consumed.extend([name, scale_name])
|
||||
if scale2_name in self.model_tensors:
|
||||
consumed.append(scale2_name)
|
||||
if input_scale_name in self.model_tensors:
|
||||
consumed.append(input_scale_name)
|
||||
|
||||
# Check if this is a per-expert tensor
|
||||
m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name)
|
||||
@@ -663,34 +679,37 @@ class ModelBase:
|
||||
if key not in expert_blocks:
|
||||
expert_blocks[key] = []
|
||||
expert_scales[key] = []
|
||||
expert_input_scales[key] = []
|
||||
expert_shapes[key] = shape
|
||||
expert_blocks[key].append((expert_id, raw.copy()))
|
||||
# Collect per-expert scale2 (scalar per expert)
|
||||
expert_scales[key].append((expert_id, float(scale2.float().sum())))
|
||||
# Collect per-expert input_scale (scalar per expert)
|
||||
expert_input_scales[key].append((expert_id, float(input_scale.float().sum())))
|
||||
|
||||
# Flush when all experts for this (layer, proj) are collected
|
||||
if n_experts > 0 and len(expert_blocks[key]) >= n_experts:
|
||||
self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_shapes, bid, proj_type)
|
||||
self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)
|
||||
else:
|
||||
new_name = self.map_tensor_name(name)
|
||||
self._repack_nvfp4(new_name, weight, scale, scale2)
|
||||
self._repack_nvfp4(name, weight, scale, scale2, input_scale)
|
||||
|
||||
# Flush any remaining experts (fallback if n_experts was unknown)
|
||||
for (bid, proj_type) in list(expert_blocks.keys()):
|
||||
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type)
|
||||
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)
|
||||
|
||||
# Remove consumed tensors so get_tensors/modify_tensors won't see them
|
||||
for name in consumed:
|
||||
self.model_tensors.pop(name, None)
|
||||
|
||||
# Remove unused auxiliary tensors (input_scale, k_scale, v_scale)
|
||||
# Remove any remaining unused auxiliary tensors
|
||||
for name in list(self.model_tensors.keys()):
|
||||
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
|
||||
if name.endswith((".k_scale", ".v_scale")):
|
||||
del self.model_tensors[name]
|
||||
|
||||
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type):
|
||||
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type):
|
||||
experts = expert_blocks.pop(key)
|
||||
scales = expert_scales.pop(key)
|
||||
input_scales = expert_input_scales.pop(key)
|
||||
shape = expert_shapes.pop(key)
|
||||
|
||||
experts.sort(key=lambda x: x[0])
|
||||
@@ -708,6 +727,14 @@ class ModelBase:
|
||||
logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
|
||||
self.gguf_writer.add_tensor(scale_name, scale_vals)
|
||||
|
||||
# Emit per-expert input_scale tensor if any expert has non-trivial input_scale
|
||||
input_scales.sort(key=lambda x: x[0])
|
||||
input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
|
||||
if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
|
||||
input_scale_name = new_name.replace(".weight", ".input_scale")
|
||||
logger.info(f" + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
|
||||
self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
|
||||
|
||||
del experts, merged
|
||||
|
||||
def prepare_tensors(self):
|
||||
@@ -947,6 +974,9 @@ class ModelBase:
|
||||
if "thinker_config" in config:
|
||||
# rename for Qwen2.5-Omni
|
||||
config["text_config"] = config["thinker_config"]["text_config"]
|
||||
if "language_config" in config:
|
||||
# rename for DeepSeekOCR
|
||||
config["text_config"] = config["language_config"]
|
||||
if "lfm" in config:
|
||||
# rename for LFM2-Audio
|
||||
config["text_config"] = config["lfm"]
|
||||
@@ -1308,6 +1338,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
|
||||
# ref: https://huggingface.co/aari1995/German_Semantic_V3
|
||||
res = "jina-v2-de"
|
||||
if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4":
|
||||
# ref: https://huggingface.co/evilfreelancer/ruGPT3XL
|
||||
res = "gpt-2"
|
||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
res = "llama-bpe"
|
||||
@@ -1503,6 +1536,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869":
|
||||
# ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601
|
||||
res = "kanana2"
|
||||
if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
|
||||
# ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
|
||||
res = "f2llmv2"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2071,7 +2107,7 @@ class MmprojModel(ModelBase):
|
||||
preprocessor_config: dict[str, Any]
|
||||
global_config: dict[str, Any]
|
||||
|
||||
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers", "vt_num_hidden_layers"]
|
||||
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers", "encoder_layers", "vt_num_hidden_layers"]
|
||||
|
||||
has_vision_encoder: bool = True # by default
|
||||
has_audio_encoder: bool = False
|
||||
@@ -5005,6 +5041,97 @@ class _LinearAttentionVReorderBase(Qwen3NextModel):
|
||||
perm[dim], perm[dim + 1] = perm[dim + 1], perm[dim]
|
||||
return tensor.permute(*perm).contiguous().reshape(*shape)
|
||||
|
||||
def _transform_nvfp4_weight(self, name: str, weight: Tensor, scale: Tensor) -> tuple[Tensor, Tensor]:
|
||||
if not name.endswith((
|
||||
".linear_attn.in_proj_qkv.weight",
|
||||
".linear_attn.in_proj_z.weight",
|
||||
".linear_attn.in_proj_a.weight",
|
||||
".linear_attn.in_proj_b.weight",
|
||||
".linear_attn.out_proj.weight",
|
||||
)):
|
||||
return weight, scale
|
||||
|
||||
num_k_heads = self.hparams["linear_num_key_heads"]
|
||||
num_v_heads = self.hparams["linear_num_value_heads"]
|
||||
head_k_dim = self.hparams["linear_key_head_dim"]
|
||||
head_v_dim = self.hparams["linear_value_head_dim"]
|
||||
num_v_per_k = num_v_heads // num_k_heads
|
||||
|
||||
def unpack_nibbles(qs: Tensor) -> Tensor:
|
||||
lo = torch.bitwise_and(qs, 0x0F)
|
||||
hi = torch.bitwise_right_shift(qs, 4)
|
||||
return torch.stack((lo, hi), dim=-1).reshape(*qs.shape[:-1], qs.shape[-1] * 2)
|
||||
|
||||
def pack_nibbles(codes: Tensor) -> Tensor:
|
||||
codes = codes.reshape(*codes.shape[:-1], codes.shape[-1] // 2, 2)
|
||||
lo = torch.bitwise_and(codes[..., 0], 0x0F)
|
||||
hi = torch.bitwise_left_shift(torch.bitwise_and(codes[..., 1], 0x0F), 4)
|
||||
return torch.bitwise_or(lo, hi).contiguous()
|
||||
|
||||
def apply_col_perm(qs: Tensor, scales: Tensor, col_perm: Tensor) -> tuple[Tensor, Tensor]:
|
||||
assert qs.ndim >= 2
|
||||
assert scales.ndim >= 2
|
||||
|
||||
k = qs.shape[-1] * 2
|
||||
assert col_perm.numel() == k
|
||||
assert k % 16 == 0
|
||||
|
||||
group_cols = col_perm.reshape(-1, 16)
|
||||
group_starts = group_cols[:, 0]
|
||||
expected = group_starts.unsqueeze(1) + torch.arange(16, dtype=col_perm.dtype)
|
||||
assert torch.equal(group_cols, expected)
|
||||
assert torch.all(group_starts % 16 == 0)
|
||||
|
||||
group_perm = (group_starts // 16).to(dtype=torch.long)
|
||||
expected_groups = torch.arange(scales.shape[-1], dtype=torch.long)
|
||||
assert group_perm.numel() == scales.shape[-1]
|
||||
assert torch.equal(torch.sort(group_perm).values, expected_groups)
|
||||
|
||||
codes = unpack_nibbles(qs)
|
||||
codes = codes.index_select(-1, col_perm.to(device=qs.device, dtype=torch.long))
|
||||
qs = pack_nibbles(codes)
|
||||
scales = scales.index_select(-1, group_perm.to(device=scales.device))
|
||||
return qs, scales
|
||||
|
||||
def reorder_rows(qs: Tensor, scales: Tensor, head_dim: int) -> tuple[Tensor, Tensor]:
|
||||
row_perm = self._reorder_v_heads(
|
||||
torch.arange(num_v_heads * head_dim, dtype=torch.long).unsqueeze(-1),
|
||||
0, num_k_heads, num_v_per_k, head_dim,
|
||||
).squeeze(-1)
|
||||
return (
|
||||
qs.index_select(0, row_perm.to(device=qs.device)),
|
||||
scales.index_select(0, row_perm.to(device=scales.device)),
|
||||
)
|
||||
|
||||
if name.endswith(".linear_attn.in_proj_qkv.weight"):
|
||||
q_dim = head_k_dim * num_k_heads
|
||||
k_dim = head_k_dim * num_k_heads
|
||||
q = weight[:q_dim]
|
||||
k = weight[q_dim:q_dim + k_dim]
|
||||
v = weight[q_dim + k_dim:]
|
||||
q_scale = scale[:q_dim]
|
||||
k_scale = scale[q_dim:q_dim + k_dim]
|
||||
v_scale = scale[q_dim + k_dim:]
|
||||
v, v_scale = reorder_rows(v, v_scale, head_v_dim)
|
||||
return torch.cat([q, k, v], dim=0), torch.cat([q_scale, k_scale, v_scale], dim=0)
|
||||
|
||||
if name.endswith(".linear_attn.in_proj_z.weight"):
|
||||
weight, scale = reorder_rows(weight, scale, head_v_dim)
|
||||
elif name.endswith((".linear_attn.in_proj_a.weight", ".linear_attn.in_proj_b.weight")):
|
||||
weight, scale = reorder_rows(weight, scale, 1)
|
||||
elif name.endswith(".linear_attn.out_proj.weight"):
|
||||
col_perm = self._reorder_v_heads(
|
||||
torch.arange(num_v_heads * head_v_dim, dtype=torch.long).unsqueeze(0),
|
||||
1, num_k_heads, num_v_per_k, head_v_dim,
|
||||
).squeeze(0)
|
||||
weight, scale = apply_col_perm(weight, scale, col_perm)
|
||||
|
||||
return weight, scale
|
||||
|
||||
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
|
||||
weight, scale = self._transform_nvfp4_weight(name, weight, scale)
|
||||
super()._repack_nvfp4(name, weight, scale, scale2, input_scale)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
num_k_heads = self.hparams.get("linear_num_key_heads", 0)
|
||||
num_v_heads = self.hparams.get("linear_num_value_heads", 0)
|
||||
@@ -5094,6 +5221,47 @@ class GPT2Model(TextModel):
|
||||
yield from super().modify_tensors(data_torch, new_name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("RuGPT3XLForCausalLM")
|
||||
class RuGPT3XLModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.GPT2
|
||||
|
||||
_qkv_parts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# Fuse separate Q, K, V projections into a single QKV tensor
|
||||
if ".self_attn.q_proj." in name or ".self_attn.k_proj." in name or ".self_attn.v_proj." in name:
|
||||
suffix = "weight" if name.endswith(".weight") else "bias"
|
||||
part = "q" if ".q_proj." in name else ("k" if ".k_proj." in name else "v")
|
||||
key = f"{part}.{suffix}"
|
||||
|
||||
assert bid is not None
|
||||
if self._qkv_parts is None:
|
||||
self._qkv_parts = [{} for _ in range(self.block_count)]
|
||||
self._qkv_parts[bid][key] = data_torch
|
||||
|
||||
q_key, k_key, v_key = f"q.{suffix}", f"k.{suffix}", f"v.{suffix}"
|
||||
if all(k in self._qkv_parts[bid] for k in [q_key, k_key, v_key]):
|
||||
q = self._qkv_parts[bid].pop(q_key)
|
||||
k = self._qkv_parts[bid].pop(k_key)
|
||||
v = self._qkv_parts[bid].pop(v_key)
|
||||
data_torch = torch.cat([q, k, v], dim=0)
|
||||
name = self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, f".{suffix}")
|
||||
logger.debug(f"Fused Q/K/V {suffix} for layer {bid} -> {name}")
|
||||
else:
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._qkv_parts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
parts = [f"({i}){k}" for i, d in enumerate(self._qkv_parts) for k in d.keys()]
|
||||
if len(parts) > 0:
|
||||
raise ValueError(f"Unprocessed Q/K/V parts: {parts}")
|
||||
|
||||
|
||||
@ModelBase.register("PhiForCausalLM")
|
||||
class Phi2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.PHI2
|
||||
@@ -6935,6 +7103,70 @@ class ConformerAudioModel(MmprojModel):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("DeepseekOCRForCausalLM")
|
||||
class DeepseekOCRVisionModel(MmprojModel):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
|
||||
# default values below are taken from HF tranformers code
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
# calculate proj_scale_factor (used by tinygemma3 test model)
|
||||
image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
|
||||
n_per_side = int(image_seq_length ** 0.5)
|
||||
image_size = self.hparams["image_size"]
|
||||
patch_size = self.hparams["patch_size"]
|
||||
proj_scale_factor = (image_size // patch_size) // n_per_side
|
||||
if proj_scale_factor > 0 and proj_scale_factor != 4:
|
||||
# we only need to write this if it's not the default value
|
||||
# in this case, we are converting a test model
|
||||
self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
|
||||
# @bluebread: there's no window_size in config but just add it here anyway
|
||||
self.gguf_writer.add_vision_window_size(self.hparams.get("window_size", 14))
|
||||
|
||||
# SAM configuration
|
||||
sam_hparams = hparams['sam']
|
||||
self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers'])
|
||||
self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width'])
|
||||
self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads'])
|
||||
|
||||
def get_vision_config(self) -> dict[str, Any]:
|
||||
vision_config: dict[str, Any] | None = self.global_config.get("vision_config")
|
||||
|
||||
if not vision_config:
|
||||
raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")
|
||||
|
||||
vision_config['sam'] = vision_config['width']['sam_vit_b']
|
||||
vision_config.update(vision_config['width']['clip-l-14-224'])
|
||||
vision_config['hidden_size'] = vision_config['width']
|
||||
vision_config['num_heads'] = vision_config['heads']
|
||||
vision_config['intermediate_size'] = vision_config['heads'] * 4
|
||||
|
||||
return vision_config
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
if ".embeddings." in name or 'pos_embed' in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
if ".rel_pos_h" in name or '.rel_pos_w' in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
if ".neck." in name or ".net_" in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# Only process vision-related tensors, skip language model tensors
|
||||
# Vision components: sam_model, vision_model, projector, image_newline, view_seperator
|
||||
# Language model components to skip: lm_head, embed_tokens, layers, norm
|
||||
if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")):
|
||||
return
|
||||
|
||||
if name.endswith("pos_embed") or name.endswith("rel_pos_h") or name.endswith("rel_pos_w"):
|
||||
name += ".weight"
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma3nForConditionalGeneration")
|
||||
class Gemma3nVisionAudioModel(ConformerAudioModel):
|
||||
has_audio_encoder = True
|
||||
@@ -8280,6 +8512,19 @@ class DeepseekV2Model(TextModel):
|
||||
|
||||
merge_expert = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
hparams: dict = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
# special handling for Deepseek OCR
|
||||
if self.origin_hf_arch == "DeepseekOCRForCausalLM":
|
||||
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
|
||||
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
|
||||
self.gguf_writer.add_architecture()
|
||||
# default jinja template
|
||||
self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}")
|
||||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_gpt2()
|
||||
@@ -8335,9 +8580,15 @@ class DeepseekV2Model(TextModel):
|
||||
raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR)
|
||||
|
||||
# note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
|
||||
self.hparams["num_key_value_heads"] = 1
|
||||
if is_ocr:
|
||||
self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0)
|
||||
else:
|
||||
# note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
|
||||
self.hparams["num_key_value_heads"] = 1
|
||||
|
||||
self.hparams['rms_norm_eps'] = self.hparams.get('rms_norm_eps', 1e-6)
|
||||
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
@@ -8351,16 +8602,18 @@ class DeepseekV2Model(TextModel):
|
||||
# Default: if no MoE, all layers are dense; if MoE, none are dense
|
||||
first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
|
||||
self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
|
||||
kv_lora_rank = hparams.get("kv_lora_rank", 512)
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
|
||||
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
|
||||
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
|
||||
|
||||
# note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
|
||||
self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
|
||||
self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
|
||||
self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
|
||||
self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
|
||||
if not is_ocr:
|
||||
self.gguf_writer.add_kv_lora_rank(kv_lora_rank)
|
||||
self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"])
|
||||
self.gguf_writer.add_value_length(kv_lora_rank)
|
||||
self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
|
||||
self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
|
||||
|
||||
# MoE parameters (required by C++ code for DEEPSEEK2 arch)
|
||||
# For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
|
||||
@@ -8392,8 +8645,15 @@ class DeepseekV2Model(TextModel):
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5
|
||||
if "vision_tower" in name or "multi_modal_projector" in name or "mm_projector" in name:
|
||||
# skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5, and DeepSeek-OCR
|
||||
if ("vision_tower" in name
|
||||
or "multi_modal_projector" in name
|
||||
or "mm_projector" in name
|
||||
or "vision_model" in name
|
||||
or "image_newline" in name
|
||||
or "model.projector" in name
|
||||
or "sam_model" in name
|
||||
or "view_seperator" in name):
|
||||
return
|
||||
if name.startswith("siglip2.") or name.startswith("merger."):
|
||||
return
|
||||
|
||||
@@ -154,6 +154,7 @@ models = [
|
||||
{"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", },
|
||||
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
|
||||
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
|
||||
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
@@ -177,6 +178,7 @@ pre_computed_hashes = [
|
||||
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
|
||||
# jina-v2-de variants
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
|
||||
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/evilfreelancer/ruGPT3XL", "chkhsh": "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4"},
|
||||
]
|
||||
|
||||
|
||||
|
||||
+78
-54
@@ -42,12 +42,22 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
||||
|
||||
### Ascend NPU
|
||||
|
||||
**Verified devices**
|
||||
You can retrieve your Ascend device IDs using the following command:
|
||||
|
||||
| Ascend NPU | Status |
|
||||
|:-----------------------------:|:-------:|
|
||||
| Atlas 300T A2 | Support |
|
||||
| Atlas 300I Duo | Support |
|
||||
```sh
|
||||
lspci -n | grep -Eo '19e5:d[0-9a-f]{3}' | cut -d: -f2
|
||||
```
|
||||
|
||||
**Devices**
|
||||
|
||||
| Device Id | Product Series | Product Models | Chip Model | Verified Status |
|
||||
|:---------:|----------------|----------------|:----------:|:---------------:|
|
||||
| d803 | Atlas A3 Train | | 910C | |
|
||||
| d803 | Atlas A3 Infer | | 910C | |
|
||||
| d802 | Atlas A2 Train | | 910B | |
|
||||
| d802 | Atlas A2 Infer | Atlas 300I A2 | 910B | Support |
|
||||
| d801 | Atlas Train | | 910 | |
|
||||
| d500 | Atlas Infer | Atlas 300I Duo | 310P | Support |
|
||||
|
||||
*Notes:*
|
||||
|
||||
@@ -57,6 +67,9 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
||||
|
||||
## Model Supports
|
||||
|
||||
<details>
|
||||
<summary>Text-only</summary>
|
||||
|
||||
| Model Name | FP16 | Q4_0 | Q8_0 |
|
||||
|:----------------------------|:-----:|:----:|:----:|
|
||||
| Llama-2 | √ | √ | √ |
|
||||
@@ -118,8 +131,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
||||
| Trillion-7B-preview | √ | √ | √ |
|
||||
| Ling models | √ | √ | √ |
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>Multimodal</summary>
|
||||
|
||||
**Multimodal**
|
||||
| Model Name | FP16 | Q4_0 | Q8_0 |
|
||||
|:----------------------------|:-----:|:----:|:----:|
|
||||
| LLaVA 1.5 models, LLaVA 1.6 models | x | x | x |
|
||||
@@ -134,15 +150,22 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
||||
| GLM-EDGE | √ | √ | √ |
|
||||
| Qwen2-VL | √ | √ | √ |
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
## DataType Supports
|
||||
|
||||
| DataType | Status |
|
||||
|:----------------------:|:-------:|
|
||||
| FP16 | Support |
|
||||
| Q8_0 | Support |
|
||||
| Q4_0 | Support |
|
||||
| DataType | 910B | 310P |
|
||||
|:----------------------:|:-------:|:-------:|
|
||||
| FP16 | Support | Support |
|
||||
| Q8_0 | Support | Partial |
|
||||
| Q4_0 | Support | Partial |
|
||||
| BF16 | Support | |
|
||||
|
||||
> **310P note**
|
||||
> - `Q8_0`: data transform / buffer path is implemented, and `GET_ROWS` is supported, but quantized `MUL_MAT` / `MUL_MAT_ID` are not supported.
|
||||
> - `Q4_0`: data transform / buffer path is implemented, but quantized `MUL_MAT` / `MUL_MAT_ID` are not supported.
|
||||
|
||||
## Docker
|
||||
|
||||
@@ -160,7 +183,20 @@ npu-smi info
|
||||
|
||||
# Select the cards that you want to use, make sure these cards are not used by someone.
|
||||
# Following using cards of device0.
|
||||
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
|
||||
docker run --name llamacpp \
|
||||
--device /dev/davinci0 \
|
||||
--device /dev/davinci_manager \
|
||||
--device /dev/devmm_svm \
|
||||
--device /dev/hisi_hdc \
|
||||
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||
-v /PATH_TO_YOUR_MODELS/:/app/models \
|
||||
-it llama-cpp-cann \
|
||||
-m /app/models/MODEL_PATH \
|
||||
-ngl 32 \
|
||||
-p "Building a website can be done in 10 simple steps:"
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
@@ -171,69 +207,57 @@ docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager
|
||||
|
||||
### I. Setup Environment
|
||||
|
||||
1. **Install Ascend Driver and firmware**
|
||||
1. **Configure Ascend user and group**
|
||||
|
||||
```sh
|
||||
# create driver running user.
|
||||
sudo groupadd -g HwHiAiUser
|
||||
sudo groupadd HwHiAiUser
|
||||
sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
|
||||
sudo usermod -aG HwHiAiUser $USER
|
||||
|
||||
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
|
||||
# and install driver.
|
||||
sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
|
||||
```
|
||||
|
||||
Once installed, run `npu-smi info` to check whether driver is installed successfully.
|
||||
2. **Install dependencies**
|
||||
|
||||
**Ubuntu/Debian:**
|
||||
```sh
|
||||
+-------------------------------------------------------------------------------------------+
|
||||
| npu-smi 24.1.rc2 Version: 24.1.rc2 |
|
||||
+----------------------+---------------+----------------------------------------------------+
|
||||
| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)|
|
||||
| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) |
|
||||
+======================+===============+====================================================+
|
||||
| 2 xxx | OK | 64.4 51 15 / 15 |
|
||||
| 0 | 0000:01:00.0 | 0 1873 / 15077 0 / 32768 |
|
||||
+======================+===============+====================================================+
|
||||
| 5 xxx | OK | 64.0 52 15 / 15 |
|
||||
| 0 | 0000:81:00.0 | 0 1874 / 15077 0 / 32768 |
|
||||
+======================+===============+====================================================+
|
||||
| No running processes found in NPU 2 |
|
||||
+======================+===============+====================================================+
|
||||
| No running processes found in NPU 5 |
|
||||
+======================+===============+====================================================+
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gcc python3 python3-pip linux-headers-$(uname -r)
|
||||
```
|
||||
|
||||
2. **Install Ascend Firmware**
|
||||
**RHEL/CentOS:**
|
||||
```sh
|
||||
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
|
||||
# and install driver.
|
||||
sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
|
||||
sudo yum makecache
|
||||
sudo yum install -y gcc python3 python3-pip kernel-headers-$(uname -r) kernel-devel-$(uname -r)
|
||||
```
|
||||
If the following message appears, firmware is installed successfully.
|
||||
|
||||
3. **Install CANN (driver + toolkit)**
|
||||
|
||||
> The `Ascend-cann` package includes both the driver and toolkit.
|
||||
> `$ARCH` can be `x86_64` or `aarch64`, `$CHIP` can be `910b` or `310p`.
|
||||
|
||||
```sh
|
||||
Firmware package installed successfully!
|
||||
wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.5.T63/Ascend-cann_8.5.0_linux-$ARCH.run
|
||||
sudo bash ./Ascend-cann_8.5.0_linux-$ARCH.run --install
|
||||
|
||||
wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.5.T63/Ascend-cann-$CHIP-ops_8.5.0_linux-$ARCH.run
|
||||
sudo bash ./Ascend-cann-$CHIP-ops_8.5.0_linux-$ARCH.run --install
|
||||
```
|
||||
|
||||
4. **Verify installation**
|
||||
|
||||
3. **Install CANN toolkit and kernels**
|
||||
|
||||
CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
|
||||
|
||||
Please download the corresponding version that satified your system. The minimum version required is 8.0.RC2.alpha002 and here is the install command.
|
||||
```sh
|
||||
pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
|
||||
sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
|
||||
sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
|
||||
npu-smi info
|
||||
```
|
||||
|
||||
Set Ascend Variables:
|
||||
If device information is displayed correctly, the driver is functioning properly.
|
||||
|
||||
```sh
|
||||
echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
|
||||
source ~/.bashrc
|
||||
# Set environment variables (adjust path if needed)
|
||||
source /usr/local/Ascend/cann/set_env.sh
|
||||
|
||||
python3 -c "import acl; print(acl.get_soc_name())"
|
||||
```
|
||||
|
||||
Upon a successful installation, CANN is enabled for the available ascend devices.
|
||||
If the command outputs the chip model, the installation was successful.
|
||||
|
||||
### II. Build llama.cpp
|
||||
|
||||
|
||||
@@ -31,6 +31,13 @@ llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.g
|
||||
llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
|
||||
```
|
||||
|
||||
> [!IMPORTANT]
|
||||
>
|
||||
> OCR models are trained with specific prompt and input structure, please refer to these discussions for more info:
|
||||
> - PaddleOCR-VL: https://github.com/ggml-org/llama.cpp/pull/18825
|
||||
> - GLM-OCR: https://github.com/ggml-org/llama.cpp/pull/19677
|
||||
> - Deepseek-OCR: https://github.com/ggml-org/llama.cpp/pull/17400
|
||||
|
||||
## Pre-quantized models
|
||||
|
||||
These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc
|
||||
|
||||
@@ -460,6 +460,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
if(NOT GGML_CPU_ALL_VARIANTS)
|
||||
set(MARCH_STR "rv64gc")
|
||||
if (GGML_RVV)
|
||||
string(APPEND MARCH_STR "v")
|
||||
endif()
|
||||
|
||||
if (GGML_RV_ZFH)
|
||||
string(APPEND MARCH_STR "_zfh")
|
||||
endif()
|
||||
@@ -467,7 +471,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (GGML_XTHEADVECTOR)
|
||||
string(APPEND MARCH_STR "_xtheadvector")
|
||||
elseif (GGML_RVV)
|
||||
string(APPEND MARCH_STR "_v")
|
||||
if (GGML_RV_ZVFH)
|
||||
string(APPEND MARCH_STR "_zvfh")
|
||||
endif()
|
||||
@@ -475,12 +478,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
string(APPEND MARCH_STR "_zvfbfwma")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_RV_ZICBOP)
|
||||
string(APPEND MARCH_STR "_zicbop")
|
||||
endif()
|
||||
if (GGML_RV_ZIHINTPAUSE)
|
||||
string(APPEND MARCH_STR "_zihintpause")
|
||||
endif()
|
||||
|
||||
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
|
||||
else()
|
||||
# Begin with the lowest baseline
|
||||
|
||||
@@ -2871,8 +2871,12 @@ struct ggml_cplan ggml_graph_plan(
|
||||
const int64_t ne11 = node->src[1]->ne[1]; // H
|
||||
const int64_t ne12 = node->src[1]->ne[2]; // Channels In
|
||||
|
||||
cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
|
||||
cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
|
||||
GGML_ASSERT(node->src[0]->type == GGML_TYPE_F16 || node->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(node->src[1]->type == GGML_TYPE_F32);
|
||||
|
||||
cur += ggml_type_size(node->src[0]->type) * ne00 * ne01 * ne02 * ne03;
|
||||
cur += ggml_type_size(node->src[0]->type) * ne10 * ne11 * ne12;
|
||||
|
||||
} break;
|
||||
case GGML_OP_TOP_K:
|
||||
{
|
||||
|
||||
+50
-19
@@ -6923,16 +6923,15 @@ void ggml_compute_forward_conv_3d(
|
||||
ggml_compute_forward_conv_3d_impl(params, src0, src1, dst, src0->type);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_transpose_2d
|
||||
|
||||
void ggml_compute_forward_conv_transpose_2d(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
template <typename kernel_t>
|
||||
static void ggml_compute_forward_conv_transpose_2d_impl(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -6943,7 +6942,7 @@ void ggml_compute_forward_conv_transpose_2d(
|
||||
|
||||
const int nk = ne00*ne01*ne02*ne03;
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb00 == ggml_type_size(src0->type));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (ith == 0) {
|
||||
@@ -6951,12 +6950,12 @@ void ggml_compute_forward_conv_transpose_2d(
|
||||
|
||||
// permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
|
||||
{
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
||||
kernel_t * const wdata = (kernel_t *) params->wdata + 0;
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
|
||||
ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
|
||||
const kernel_t * const src = (kernel_t *)((char *) src0->data + i03*nb03 + i02*nb02);
|
||||
kernel_t * dst_data = wdata + i02*ne01*ne00*ne03;
|
||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
|
||||
@@ -6968,13 +6967,17 @@ void ggml_compute_forward_conv_transpose_2d(
|
||||
|
||||
// permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
|
||||
{
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
|
||||
kernel_t * const wdata = (kernel_t *) params->wdata + nk;
|
||||
for (int i12 = 0; i12 < ne12; i12++) {
|
||||
for (int i11 = 0; i11 < ne11; i11++) {
|
||||
const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
|
||||
ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
|
||||
kernel_t * dst_data = wdata + i11*ne10*ne12;
|
||||
for (int i10 = 0; i10 < ne10; i10++) {
|
||||
dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]);
|
||||
if constexpr (std::is_same_v<kernel_t, ggml_fp16_t>) {
|
||||
dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]);
|
||||
} else {
|
||||
dst_data[i10*ne12 + i12] = src[i10];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6996,21 +6999,27 @@ void ggml_compute_forward_conv_transpose_2d(
|
||||
const int ip0 = dp*ith;
|
||||
const int ip1 = MIN(ip0 + dp, np);
|
||||
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
||||
ggml_fp16_t * const wdata_src = wdata + nk;
|
||||
kernel_t * const wdata = (kernel_t *) params->wdata + 0;
|
||||
kernel_t * const wdata_src = wdata + nk;
|
||||
|
||||
for (int i2 = ip0; i2 < ip1; i2++) { // Cout
|
||||
float * dst_data = (float *)((char *) dst->data + i2*nb2);
|
||||
ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
|
||||
kernel_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
|
||||
for (int i11 = 0; i11 < ne11; i11++) {
|
||||
for (int i10 = 0; i10 < ne10; i10++) {
|
||||
const int i1n = i11*ne10*ne12 + i10*ne12;
|
||||
for (int i01 = 0; i01 < ne01; i01++) {
|
||||
for (int i00 = 0; i00 < ne00; i00++) {
|
||||
float v = 0;
|
||||
ggml_vec_dot_f16(ne03, &v, 0,
|
||||
wdata_src + i1n, 0,
|
||||
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
|
||||
if constexpr (std::is_same_v<kernel_t, ggml_fp16_t>) {
|
||||
ggml_vec_dot_f16(ne03, &v, 0,
|
||||
wdata_src + i1n, 0,
|
||||
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
|
||||
} else {
|
||||
ggml_vec_dot_f32(ne03, &v, 0,
|
||||
wdata_src + i1n, 0,
|
||||
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
|
||||
}
|
||||
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
|
||||
}
|
||||
}
|
||||
@@ -7019,6 +7028,28 @@ void ggml_compute_forward_conv_transpose_2d(
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_conv_transpose_2d(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_conv_transpose_2d_impl<ggml_fp16_t>(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_conv_transpose_2d_impl<float>(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_2d_dw
|
||||
|
||||
struct ggml_conv_2d_dw_params {
|
||||
|
||||
@@ -799,6 +799,22 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
|
||||
#endif // CUDART_VERSION >= 12050
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
|
||||
#ifdef FP8_AVAILABLE
|
||||
const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
|
||||
#if defined(GGML_USE_HIP) && defined(CDNA3)
|
||||
// ROCm dose not support fp8 in software on devices with fp8 hardware,
|
||||
// but CDNA3 supports only e4m3_fnuz (no inf).
|
||||
const __hip_fp8_e4m3_fnuz xf = *reinterpret_cast<const __hip_fp8_e4m3_fnuz *>(&bits);
|
||||
#else
|
||||
const __nv_fp8_e4m3 xf = *reinterpret_cast<const __nv_fp8_e4m3 *>(&bits);
|
||||
#endif // defined(GGML_USE_HIP) && defined(GGML_USE_HIP)
|
||||
return static_cast<float>(xf) / 2;
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FP8_AVAILABLE
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
|
||||
const uint8_t sign_bit = (x < 0.0f) << 3;
|
||||
float ax = fabsf(x) * e;
|
||||
@@ -931,6 +947,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_MXFP4> {
|
||||
static constexpr int qi = QI_MXFP4;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ggml_cuda_type_traits<GGML_TYPE_NVFP4> {
|
||||
static constexpr int qk = QK_NVFP4;
|
||||
static constexpr int qr = QR_NVFP4;
|
||||
static constexpr int qi = QI_NVFP4;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
|
||||
static constexpr int qk = QK_K;
|
||||
|
||||
@@ -1,12 +1,20 @@
|
||||
#include <algorithm>
|
||||
|
||||
#include "conv2d-transpose.cuh"
|
||||
#include "ggml.h"
|
||||
#include "convert.cuh"
|
||||
|
||||
__global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel,
|
||||
float * __restrict__ output, const int in_w, const int in_h, const int out_w,
|
||||
const int out_h, const int kernel_w, const int kernel_h, const int stride,
|
||||
const int c_in, const int c_out, const int batches) {
|
||||
template <typename kernel_t>
|
||||
static __global__ void conv2d_transpose_kernel(const float * __restrict__ input,
|
||||
const kernel_t * __restrict__ kernel,
|
||||
float * __restrict__ output,
|
||||
const int in_w,
|
||||
const int in_h,
|
||||
const int out_w,
|
||||
const int out_h,
|
||||
const int kernel_w,
|
||||
const int kernel_h,
|
||||
const int stride,
|
||||
const int c_in,
|
||||
const int c_out,
|
||||
const int batches) {
|
||||
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
const int total_elements = out_w * out_h * c_out * batches;
|
||||
@@ -26,24 +34,32 @@ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const
|
||||
for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) {
|
||||
for (int kh = 0; kh < kernel_h; ++kh) {
|
||||
int in_y = out_y_idx - kh;
|
||||
if (in_y < 0 || in_y % stride) continue;
|
||||
if (in_y < 0 || in_y % stride) {
|
||||
continue;
|
||||
}
|
||||
in_y /= stride;
|
||||
if (in_y >= in_h) continue;
|
||||
if (in_y >= in_h) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int kw = 0; kw < kernel_w; ++kw) {
|
||||
int in_x = out_x_idx - kw;
|
||||
if (in_x < 0 || in_x % stride) continue;
|
||||
if (in_x < 0 || in_x % stride) {
|
||||
continue;
|
||||
}
|
||||
in_x /= stride;
|
||||
if (in_x >= in_w) continue;
|
||||
if (in_x >= in_w) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x;
|
||||
const int kernel_idx =
|
||||
(kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw;
|
||||
|
||||
float input_val = input[input_idx];
|
||||
half kern_val = kernel[kernel_idx];
|
||||
float input_val = input[input_idx];
|
||||
kernel_t kern_val = kernel[kernel_idx];
|
||||
|
||||
accumulator += input_val * (float) kern_val;
|
||||
accumulator += input_val * ggml_cuda_cast<float>(kern_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -56,11 +72,12 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
const ggml_tensor * kernel = dst->src[0];
|
||||
const ggml_tensor * input = dst->src[1];
|
||||
|
||||
GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
|
||||
const float * input_data = (const float *) input->data;
|
||||
float * output_data = (float *) dst->data;
|
||||
const half * kernel_data = (const half *) kernel->data;
|
||||
const void * kernel_data = kernel->data;
|
||||
|
||||
const int input_w = input->ne[0];
|
||||
const int input_h = input->ne[1];
|
||||
@@ -82,10 +99,17 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
GGML_ASSERT(ggml_is_contiguous(kernel));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
const int total = (output_w * output_h * channels_out * batches);
|
||||
const int total = output_w * output_h * channels_out * batches;
|
||||
const int blocks = (total + CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE;
|
||||
|
||||
conv2d_transpose_kernel<<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
|
||||
input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride,
|
||||
channels_in, channels_out, batches);
|
||||
if (kernel->type == GGML_TYPE_F16) {
|
||||
conv2d_transpose_kernel<half><<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
|
||||
input_data, (const half *) kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w,
|
||||
kernel_h, stride, channels_in, channels_out, batches);
|
||||
|
||||
} else {
|
||||
conv2d_transpose_kernel<float><<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
|
||||
input_data, (const float *) kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w,
|
||||
kernel_h, stride, channels_in, channels_out, batches);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -617,6 +617,45 @@ static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t
|
||||
dequantize_block_mxfp4<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static __global__ void dequantize_block_nvfp4(
|
||||
const void * __restrict__ vx,
|
||||
dst_t * __restrict__ yy,
|
||||
const int64_t ne) {
|
||||
const int64_t i = blockIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
const int64_t base = i * QK_NVFP4;
|
||||
if (base >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const block_nvfp4 * x = (const block_nvfp4 *) vx;
|
||||
const block_nvfp4 & xb = x[i];
|
||||
|
||||
const int sub = tid / (QK_NVFP4_SUB / 2);
|
||||
const int j = tid % (QK_NVFP4_SUB / 2);
|
||||
|
||||
const float d = ggml_cuda_ue4m3_to_fp32(xb.d[sub]);
|
||||
const uint8_t q = xb.qs[sub * (QK_NVFP4_SUB / 2) + j];
|
||||
|
||||
const int64_t y0 = base + sub * QK_NVFP4_SUB + j;
|
||||
const int64_t y1 = y0 + QK_NVFP4_SUB / 2;
|
||||
|
||||
yy[y0] = ggml_cuda_cast<dst_t>(d * kvalues_mxfp4[q & 0x0F]);
|
||||
yy[y1] = ggml_cuda_cast<dst_t>(d * kvalues_mxfp4[q >> 4]);
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_nvfp4_cuda(
|
||||
const void * vx,
|
||||
dst_t * y,
|
||||
const int64_t k,
|
||||
cudaStream_t stream) {
|
||||
GGML_ASSERT(k % QK_NVFP4 == 0);
|
||||
const int nb = k / QK_NVFP4;
|
||||
dequantize_block_nvfp4<<<nb, 32, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
template <typename src_t, typename dst_t>
|
||||
static __global__ void convert_unary(
|
||||
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
|
||||
@@ -715,6 +754,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
return dequantize_row_iq3_s_cuda;
|
||||
case GGML_TYPE_MXFP4:
|
||||
return dequantize_row_mxfp4_cuda;
|
||||
case GGML_TYPE_NVFP4:
|
||||
return dequantize_row_nvfp4_cuda;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cont_cuda<float>;
|
||||
case GGML_TYPE_BF16:
|
||||
@@ -766,6 +807,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
return dequantize_row_iq3_s_cuda;
|
||||
case GGML_TYPE_MXFP4:
|
||||
return dequantize_row_mxfp4_cuda;
|
||||
case GGML_TYPE_NVFP4:
|
||||
return dequantize_row_nvfp4_cuda;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cont_cuda<half>;
|
||||
case GGML_TYPE_BF16:
|
||||
|
||||
@@ -1297,7 +1297,12 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
|
||||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
|
||||
|
||||
const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
|
||||
const bool use_fp16 =
|
||||
src0->type != GGML_TYPE_NVFP4 &&
|
||||
(src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
||||
ggml_is_contiguous(src0) &&
|
||||
row_diff == src0->ne[1] &&
|
||||
dst->op_params[0] == GGML_PREC_DEFAULT;
|
||||
|
||||
if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
||||
ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
|
||||
@@ -4781,6 +4786,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_MXFP4:
|
||||
#ifdef FP8_AVAILABLE
|
||||
case GGML_TYPE_NVFP4:
|
||||
#endif // FP8_AVAILABLE
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
|
||||
@@ -15,6 +15,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
|
||||
case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1;
|
||||
case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1;
|
||||
case GGML_TYPE_MXFP4: return vec_dot_mxfp4_q8_1;
|
||||
case GGML_TYPE_NVFP4: return vec_dot_nvfp4_q8_1;
|
||||
case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1;
|
||||
case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1;
|
||||
case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1;
|
||||
@@ -41,6 +42,7 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_MXFP4: return VDR_MXFP4_Q8_1_MMVQ;
|
||||
case GGML_TYPE_NVFP4: return VDR_NVFP4_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ;
|
||||
@@ -626,6 +628,12 @@ static void mul_mat_vec_q_switch_type(
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_NVFP4:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_NVFP4>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q2_K>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
|
||||
@@ -322,6 +322,38 @@ static __device__ __forceinline__ float vec_dot_mxfp4_q8_1(
|
||||
return d * sumi;
|
||||
}
|
||||
|
||||
#define VDR_NVFP4_Q8_1_MMVQ 4
|
||||
#define VDR_NVFP4_Q8_1_MMQ 8
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_nvfp4_q8_1(
|
||||
const void * __restrict__ vbq,
|
||||
const block_q8_1 * __restrict__ bq8_1,
|
||||
const int32_t & kbx,
|
||||
const int32_t & iqs) {
|
||||
|
||||
const block_nvfp4 * bq4 = (const block_nvfp4 *) vbq + kbx;
|
||||
float sum = 0.0f;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VDR_NVFP4_Q8_1_MMVQ/2; i++) {
|
||||
const int32_t iqs0 = iqs + 2*i;
|
||||
const int32_t iqs1 = iqs0 + 1;
|
||||
const int32_t is = iqs0 >> 1;
|
||||
const int2 v0 = get_int_from_table_16(get_int_b4(bq4->qs, iqs0), kvalues_mxfp4);
|
||||
const int2 v1 = get_int_from_table_16(get_int_b4(bq4->qs, iqs1), kvalues_mxfp4);
|
||||
const block_q8_1 * bq8 = bq8_1 + (is >> 1);
|
||||
const int32_t i8 = ((is & 1) << 2);
|
||||
|
||||
int sumi = ggml_cuda_dp4a(v0.x, get_int_b4(bq8->qs, i8 + 0), 0);
|
||||
sumi = ggml_cuda_dp4a(v0.y, get_int_b4(bq8->qs, i8 + 2), sumi);
|
||||
sumi = ggml_cuda_dp4a(v1.x, get_int_b4(bq8->qs, i8 + 1), sumi);
|
||||
sumi = ggml_cuda_dp4a(v1.y, get_int_b4(bq8->qs, i8 + 3), sumi);
|
||||
|
||||
const float d = ggml_cuda_ue4m3_to_fp32(bq4->d[is]) * __low2float(bq8->ds);
|
||||
sum += d * float(sumi);
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
#define VDR_Q2_K_Q8_1_MMVQ 1
|
||||
#define VDR_Q2_K_Q8_1_MMQ 4
|
||||
|
||||
|
||||
Vendored
+3
-2
@@ -6,9 +6,10 @@
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#if CUDART_VERSION >= 12050
|
||||
#if CUDART_VERSION >= 11080
|
||||
#include <cuda_fp8.h>
|
||||
#endif // CUDART_VERSION >= 12050
|
||||
#define FP8_AVAILABLE
|
||||
#endif // CUDART_VERSION >= 11080
|
||||
|
||||
#if CUDART_VERSION >= 12080
|
||||
#include <cuda_fp4.h>
|
||||
|
||||
Vendored
+6
@@ -235,6 +235,12 @@
|
||||
typedef __hip_bfloat16 nv_bfloat16;
|
||||
typedef __hip_bfloat162 nv_bfloat162;
|
||||
|
||||
#if HIP_VERSION >= 60200000
|
||||
#include <hip/hip_fp8.h>
|
||||
typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
|
||||
#define FP8_AVAILABLE
|
||||
#endif // HIP_VERSION >= 60200000
|
||||
|
||||
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
||||
|
||||
@@ -1406,6 +1406,13 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
repack_q8_0_q8x4x2(tensor, data, size);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
||||
// IQ4_NL has identical block layout to Q4_0 (ggml_half d + uint8_t qs[16])
|
||||
repack_q4_0_q4x4x2(tensor, data, size);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_MXFP4:
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
||||
@@ -1442,6 +1449,12 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
repack_q8x4x2_q8_0(data, tensor, size);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
||||
repack_q4x4x2_q4_0(data, tensor, size);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_MXFP4:
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
||||
@@ -1819,6 +1832,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
if (src0->ne[0] % 32) {
|
||||
return false;
|
||||
@@ -1868,6 +1882,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
if ((src0->ne[0] % 32)) {
|
||||
return false;
|
||||
@@ -2596,8 +2611,26 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
|
||||
delete backend;
|
||||
}
|
||||
|
||||
// Map weight type to its activation quantization family.
|
||||
// Types in the same family produce identical Q8 formats in VTCM and can
|
||||
// safely share quantized activation data via SKIP_QUANTIZE.
|
||||
// When adding a new quantized type, assign it the correct family here.
|
||||
static inline int act_quant_family(enum ggml_type wtype) {
|
||||
switch (wtype) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
return 1; // Q8x4x2
|
||||
default:
|
||||
return 0; // unknown / not quantized
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
|
||||
return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type));
|
||||
return (op0 && op0->src[1] == op1->src[1] &&
|
||||
act_quant_family(op0->src[0]->type) == act_quant_family(op1->src[0]->type) &&
|
||||
act_quant_family(op0->src[0]->type) != 0);
|
||||
}
|
||||
|
||||
static inline bool is_compute_op(ggml_tensor *node)
|
||||
@@ -3364,6 +3397,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
||||
"please update hexagon_type to match ggml_type");
|
||||
static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
|
||||
"please update hexagon_type to match ggml_type");
|
||||
static_assert((unsigned int) HTP_TYPE_IQ4_NL == (unsigned int) GGML_TYPE_IQ4_NL,
|
||||
"please update hexagon_type to match ggml_type");
|
||||
|
||||
const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL");
|
||||
const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
|
||||
|
||||
@@ -30,6 +30,12 @@ static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
|
||||
-8, 0, -7, 0, -6, 0, -5, 0, -4, 0, -3, 0, -2, 0, -1, 0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0,
|
||||
};
|
||||
|
||||
// MXFP4 dequantization LUT: maps 4-bit index to fp16 mantissa value
|
||||
// kvalues: 0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1, -1.5, -2, -3, -4, -6
|
||||
static const __fp16 mxfp4_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
|
||||
0, 0, 0.5, 0, 1, 0, 1.5, 0, 2, 0, 3, 0, 4, 0, 6, 0, 0, 0, -0.5, 0, -1, 0, -1.5, 0, -2, 0, -3, 0, -4, 0, -6, 0,
|
||||
};
|
||||
|
||||
static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
|
||||
-127, 0, -104, 0, -83, 0, -65, 0, -49, 0, -35, 0, -22, 0, -10, 0,
|
||||
1, 0, 13, 0, 25, 0, 38, 0, 53, 0, 69, 0, 89, 0, 113, 0,
|
||||
@@ -46,7 +52,8 @@ static const int32_t weight_transpose_scatter_offsets[32] __attribute__((aligned
|
||||
|
||||
// Scales per x4x2 logical block: 8 × sizeof(__fp16) = 16 bytes
|
||||
#define HMX_X4X2_SCALES_PER_BLK 8
|
||||
#define HMX_X4X2_DBLK_SIZE 16 // 8 * 2 bytes
|
||||
#define HMX_X4X2_DBLK_SIZE 16 // 8 * 2 bytes (fp16 scales for Q4_0/Q8_0/IQ4_NL)
|
||||
#define HMX_X4X2_MXFP4_EBLK_SIZE 8 // 8 * 1 byte (E8M0 scales for MXFP4)
|
||||
|
||||
static inline void swap_ptr(void **p1, void **p2) {
|
||||
void *t = *p1;
|
||||
@@ -78,9 +85,11 @@ static inline size_t get_x4x2_row_stride(int weight_type, int k) {
|
||||
switch (weight_type) {
|
||||
case HTP_TYPE_Q4_0:
|
||||
case HTP_TYPE_IQ4_NL:
|
||||
return (size_t)nb * (QK_Q4_0x4x2 / 2 + HMX_X4X2_DBLK_SIZE); // 144 * nb
|
||||
return (size_t) nb * (QK_Q4_0x4x2 / 2 + HMX_X4X2_DBLK_SIZE); // 144 * nb
|
||||
case HTP_TYPE_Q8_0:
|
||||
return (size_t)nb * (QK_Q8_0x4x2 + HMX_X4X2_DBLK_SIZE); // 272 * nb
|
||||
return (size_t) nb * (QK_Q8_0x4x2 + HMX_X4X2_DBLK_SIZE); // 272 * nb
|
||||
case HTP_TYPE_MXFP4:
|
||||
return (size_t) nb * (QK_MXFP4x4x2 / 2 + HMX_X4X2_MXFP4_EBLK_SIZE); // 136 * nb
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
@@ -284,6 +293,87 @@ static inline HVX_Vector dequantize_x4x2_q8_0_group_hvx(
|
||||
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
|
||||
}
|
||||
|
||||
// --- MXFP4 E8M0 scale conversion and dequantization ---
|
||||
//
|
||||
// HVX batch-convert 8 E8M0 bytes (one x4x2 block's scales) to __fp16[8] on stack.
|
||||
// Scalar loads from the stack array execute on the scalar pipeline, in parallel
|
||||
// with HVX vlut16/vmpy/vscatter — freeing HVX slots in the hot loop.
|
||||
// Arithmetic: fp16_bits = clamp(e - 112, 0, 30) << 10
|
||||
// e=0..112 -> 0 (underflow), e=113..142 -> valid fp16, e>=143 -> clamped to 2^15.
|
||||
|
||||
typedef struct {
|
||||
__fp16 v[8] __attribute__((aligned(16)));
|
||||
} mxfp4_scales_t;
|
||||
|
||||
static inline mxfp4_scales_t mxfp4_convert_scales(const uint8_t * e8m0_8) {
|
||||
mxfp4_scales_t s;
|
||||
HVX_Vector v = hvx_vmemu(e8m0_8);
|
||||
HVX_Vector vh = Q6_V_lo_W(Q6_Wuh_vunpack_Vub(v));
|
||||
vh = Q6_Vh_vsub_VhVh(vh, Q6_Vh_vsplat_R(112));
|
||||
vh = Q6_Vh_vmax_VhVh(vh, Q6_V_vzero());
|
||||
vh = Q6_Vh_vmin_VhVh(vh, Q6_Vh_vsplat_R(30));
|
||||
vh = Q6_Vh_vasl_VhR(vh, 10);
|
||||
hvx_vec_store_u(s.v, 16, vh);
|
||||
return s;
|
||||
}
|
||||
|
||||
static inline HVX_Vector mxfp4_extract_splat(mxfp4_scales_t scales, int idx) {
|
||||
return hvx_vec_splat_f16(scales.v[idx]);
|
||||
}
|
||||
|
||||
// Dequantize one x4x2 MXFP4 group (32 elements from 32 packed bytes) -> 32 FP16.
|
||||
static inline HVX_Vector dequantize_x4x2_mxfp4_group_hvx(const uint8_t * packed_32,
|
||||
bool upper_nibbles,
|
||||
int sub_blk,
|
||||
const HVX_Vector vlut_cvt,
|
||||
mxfp4_scales_t scales) {
|
||||
HVX_Vector vq = hvx_vmemu(packed_32);
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
|
||||
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
|
||||
|
||||
HVX_Vector v_sc = mxfp4_extract_splat(scales, sub_blk);
|
||||
|
||||
v_quants = Q6_Vb_vshuff_Vb(v_quants);
|
||||
HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
|
||||
HVX_Vector v_hf = Q6_V_lo_W(vp);
|
||||
|
||||
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_sc));
|
||||
}
|
||||
|
||||
// Batch-dequantize 4 contiguous x4x2 MXFP4 groups (4x32 = 128 packed bytes).
|
||||
static inline void dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t * packed_128,
|
||||
bool upper_nibbles,
|
||||
int sub_blk_base,
|
||||
const HVX_Vector vlut_cvt,
|
||||
mxfp4_scales_t scales,
|
||||
HVX_Vector out[4]) {
|
||||
HVX_Vector vq = hvx_vmemu(packed_128);
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
|
||||
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
|
||||
|
||||
v_quants = Q6_Vb_vshuff_Vb(v_quants);
|
||||
|
||||
HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
|
||||
HVX_Vector v_lo = Q6_V_lo_W(vp);
|
||||
HVX_Vector v_hi = Q6_V_hi_W(vp);
|
||||
|
||||
HVX_VectorPred q64 = Q6_Q_vsetq_R(64);
|
||||
HVX_Vector v_sc01 = Q6_V_vmux_QVV(q64, mxfp4_extract_splat(scales, sub_blk_base + 0),
|
||||
mxfp4_extract_splat(scales, sub_blk_base + 1));
|
||||
HVX_Vector v_sc23 = Q6_V_vmux_QVV(q64, mxfp4_extract_splat(scales, sub_blk_base + 2),
|
||||
mxfp4_extract_splat(scales, sub_blk_base + 3));
|
||||
|
||||
v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
|
||||
v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
|
||||
|
||||
out[0] = v_lo;
|
||||
out[1] = Q6_V_vror_VR(v_lo, 64);
|
||||
out[2] = v_hi;
|
||||
out[3] = Q6_V_vror_VR(v_hi, 64);
|
||||
}
|
||||
|
||||
// Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16.
|
||||
// Input: vtcm_src has n_cols rows of x4x2 data, each row_stride bytes.
|
||||
// Output: vtcm_dst in tile-major FP16 layout.
|
||||
@@ -295,11 +385,11 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
int start_tile, int end_tile) {
|
||||
|
||||
const int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS;
|
||||
const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
|
||||
const int qrow_size = is_q4 ? (k_block / 2) : k_block;
|
||||
const int qrow_size = (weight_type == HTP_TYPE_Q8_0) ? k_block : (k_block / 2);
|
||||
|
||||
const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL)
|
||||
? hvx_vmem(iq4_nl_to_fp16_lut) : hvx_vmem(q4_0_to_fp16_lut);
|
||||
const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
|
||||
(weight_type == HTP_TYPE_MXFP4) ? hvx_vmem(mxfp4_to_fp16_lut) :
|
||||
hvx_vmem(q4_0_to_fp16_lut);
|
||||
|
||||
// vscatter setup: write dequantized K-values directly to transposed [K][N] tile positions.
|
||||
// Each int32 element holds a K-row-pair (2 adjacent fp16 values). word[i] at offset i*128
|
||||
@@ -312,8 +402,9 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
int ct = t / n_k_tiles; // column tile index
|
||||
int kt = t % n_k_tiles; // K tile index
|
||||
|
||||
// --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row ---
|
||||
if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
|
||||
// --- Batch-4 fast path for Q4_0/IQ4_NL: process 4 contiguous K-tiles with one vlut16 per row ---
|
||||
if ((weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) && (kt % 4 == 0) && (t + 4 <= end_tile) &&
|
||||
((t + 3) / n_k_tiles == ct)) {
|
||||
int blk_idx = (kt * 32) / QK_Q4_0x4x2;
|
||||
int sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4
|
||||
bool upper = (sub_blk_base >= 4);
|
||||
@@ -351,10 +442,60 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Batch-4 fast path for MXFP4: same nibble layout but E8M0 scales ---
|
||||
if (weight_type == HTP_TYPE_MXFP4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
|
||||
int blk_idx = (kt * 32) / QK_MXFP4x4x2;
|
||||
int sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32; // 0 or 4
|
||||
bool upper = (sub_blk_base >= 4);
|
||||
int packed_off = blk_idx * (QK_MXFP4x4x2 / 2); // 128 contiguous packed bytes
|
||||
int e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE; // all 8 E8M0 scales
|
||||
|
||||
__fp16 * tile_bases[4];
|
||||
for (int g = 0; g < 4; g++) {
|
||||
tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS;
|
||||
}
|
||||
|
||||
HVX_Vector v_off = v_scat_base;
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
|
||||
int row0 = ct * HMX_FP16_TILE_N_COLS + r;
|
||||
int row1 = row0 + 1;
|
||||
const uint8_t * r0 = vtcm_src + row0 * row_stride;
|
||||
const uint8_t * r1 = vtcm_src + row1 * row_stride;
|
||||
|
||||
// Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
|
||||
mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
|
||||
|
||||
HVX_Vector v0[4], v1[4];
|
||||
dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8, v0);
|
||||
if (row1 < n_cols) {
|
||||
mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
|
||||
dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8, v1);
|
||||
} else {
|
||||
v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero();
|
||||
}
|
||||
|
||||
for (int g = 0; g < 4; g++) {
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]);
|
||||
}
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
for (int g = 0; g < 4; g++) {
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]);
|
||||
}
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
}
|
||||
|
||||
for (int g = 0; g < 4; g++) {
|
||||
(void) *(volatile HVX_Vector *) (tile_bases[g]);
|
||||
}
|
||||
|
||||
t += 4;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Single-tile fallback ---
|
||||
__fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS;
|
||||
|
||||
if (is_q4) {
|
||||
if (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) {
|
||||
int blk_idx = (kt * 32) / QK_Q4_0x4x2;
|
||||
int sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32;
|
||||
bool upper = (sub_blk >= 4);
|
||||
@@ -382,6 +523,39 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
}
|
||||
(void) *(volatile HVX_Vector *)(tile_base);
|
||||
} else if (weight_type == HTP_TYPE_MXFP4) {
|
||||
int blk_idx = (kt * 32) / QK_MXFP4x4x2;
|
||||
int sub_blk = ((kt * 32) % QK_MXFP4x4x2) / 32;
|
||||
bool upper = (sub_blk >= 4);
|
||||
int byte_off = blk_idx * (QK_MXFP4x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
|
||||
int e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE;
|
||||
|
||||
HVX_Vector v_off = v_scat_base;
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
|
||||
int row0 = ct * HMX_FP16_TILE_N_COLS + r;
|
||||
int row1 = row0 + 1;
|
||||
|
||||
const uint8_t * r0 = vtcm_src + row0 * row_stride;
|
||||
const uint8_t * r1 = vtcm_src + row1 * row_stride;
|
||||
|
||||
// Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
|
||||
mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
|
||||
|
||||
HVX_Vector v0 = dequantize_x4x2_mxfp4_group_hvx(r0 + byte_off, upper, sub_blk, vlut_cvt, r0_e8);
|
||||
HVX_Vector v1;
|
||||
if (row1 < n_cols) {
|
||||
mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
|
||||
v1 = dequantize_x4x2_mxfp4_group_hvx(r1 + byte_off, upper, sub_blk, vlut_cvt, r1_e8);
|
||||
} else {
|
||||
v1 = Q6_V_vzero();
|
||||
}
|
||||
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
}
|
||||
(void) *(volatile HVX_Vector *) (tile_base);
|
||||
} else {
|
||||
// Q8_0
|
||||
int blk_idx = (kt * 32) / QK_Q8_0x4x2;
|
||||
@@ -1455,21 +1629,24 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
|
||||
{
|
||||
qweight_fetch_task_state_t s;
|
||||
|
||||
const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
|
||||
const int blk_start = kk / QK_Q4_0x4x2;
|
||||
const int nb_sub = (k_blk_sz + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2;
|
||||
const int full_qrow = is_q4 ? (k / 2) : k;
|
||||
const int full_qrow = (weight_type == HTP_TYPE_Q8_0) ? k : (k / 2);
|
||||
const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz);
|
||||
const int scale_blk_size =
|
||||
(weight_type == HTP_TYPE_MXFP4) ? HMX_X4X2_MXFP4_EBLK_SIZE : HMX_X4X2_DBLK_SIZE;
|
||||
|
||||
s.dst = vtcm_scratch0;
|
||||
s.src = w + nc * row_stride;
|
||||
s.n_rows = n_blk_sz;
|
||||
s.src_stride = row_stride;
|
||||
s.dst_stride = sub_row_stride;
|
||||
s.quant_off = is_q4 ? (blk_start * (QK_Q4_0x4x2 / 2)) : (blk_start * QK_Q8_0x4x2);
|
||||
s.quant_width = is_q4 ? (nb_sub * (QK_Q4_0x4x2 / 2)) : (nb_sub * QK_Q8_0x4x2);
|
||||
s.scale_off = full_qrow + blk_start * HMX_X4X2_DBLK_SIZE;
|
||||
s.scale_width = nb_sub * HMX_X4X2_DBLK_SIZE;
|
||||
s.quant_off =
|
||||
(weight_type == HTP_TYPE_Q8_0) ? (blk_start * QK_Q8_0x4x2) : (blk_start * (QK_Q4_0x4x2 / 2));
|
||||
s.quant_width =
|
||||
(weight_type == HTP_TYPE_Q8_0) ? (nb_sub * QK_Q8_0x4x2) : (nb_sub * (QK_Q4_0x4x2 / 2));
|
||||
s.scale_off = full_qrow + blk_start * scale_blk_size;
|
||||
s.scale_width = nb_sub * scale_blk_size;
|
||||
|
||||
// 2D DMA: quants sub-range
|
||||
dma_queue_push(ctx->dma[0], dma_make_ptr(s.dst, s.src + s.quant_off),
|
||||
|
||||
@@ -31,6 +31,12 @@ struct htp_context {
|
||||
|
||||
uint32_t opmask;
|
||||
|
||||
// Cached src1 spad position from the last quantize pass.
|
||||
// When SKIP_QUANTIZE is set the Q8 activation data is already in VTCM
|
||||
// at this address; the matmul must read from here instead of recomputing
|
||||
// the offset (which depends on the current op's src0 size).
|
||||
uint8_t * prev_src1_spad;
|
||||
|
||||
// HMX acceleration fields (v73+, enabled by compile-time HTP_HAS_HMX)
|
||||
#ifdef HTP_HAS_HMX
|
||||
int hmx_enabled; // Runtime flag: HMX initialisation succeeded
|
||||
|
||||
@@ -1114,14 +1114,12 @@ static void proc_hmx_matmul_req(struct htp_context * ctx,
|
||||
return;
|
||||
}
|
||||
|
||||
// HMX only supports F16, Q4_0, Q8_0, IQ4_NL weights.
|
||||
// Other types (e.g. MXFP4) fall back to HVX.
|
||||
// HMX supports F16, Q4_0, Q8_0, IQ4_NL, MXFP4 weights.
|
||||
// Other types fall back to HVX.
|
||||
{
|
||||
uint32_t wtype = req->src0.type;
|
||||
if (wtype != HTP_TYPE_F16 &&
|
||||
wtype != HTP_TYPE_Q4_0 &&
|
||||
wtype != HTP_TYPE_Q8_0 &&
|
||||
wtype != HTP_TYPE_IQ4_NL) {
|
||||
if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL &&
|
||||
wtype != HTP_TYPE_MXFP4) {
|
||||
proc_matmul_req(ctx, req, bufs, n_bufs);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -60,6 +60,16 @@ static const uint8_t __attribute__((aligned(128))) expand_x32_e8m0[128] = {
|
||||
0x00, 0x00, 0x09, 0x08, 0x00, 0x00, 0x22, 0x20, 0x24, 0x20, 0x21, 0x22, 0x20, 0x20,
|
||||
};
|
||||
|
||||
// IQ4_NL dequantization LUT: maps 4-bit index (0-15) to int8 kvalue
|
||||
// kvalues: -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113
|
||||
static const uint8_t __attribute__((aligned(VLEN))) kvalues_iq4nl_lut[] = {
|
||||
0x81, 0, 0x98, 0, 0xAD, 0, 0xBF, 0, 0xCF, 0, 0xDD, 0, 0xEA, 0, 0xF6, 0, 0x01, 0, 0x0D, 0, 0x19, 0, 0x26, 0,
|
||||
0x35, 0, 0x45, 0, 0x59, 0, 0x71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
static const uint8_t __attribute__((aligned(VLEN))) kvalues_mxfp4_lut[] = {
|
||||
0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 6, 0, 8, 0, 12, 0, 0, 0, 0xff, 0, 0xfe, 0, 0xfd, 0, 0xfc, 0,
|
||||
0xfa, 0, 0xf8, 0, 0xf4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
@@ -68,6 +78,73 @@ static const uint8_t __attribute__((aligned(VLEN))) kvalues_mxfp4_lut[] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
static inline HVX_Vector_x8 hvx_vec_load_iq4nlx4x8_full(const uint8_t * restrict ptr) {
|
||||
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||
|
||||
HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
|
||||
HVX_Vector v2_3 = vptr[1]; // ...
|
||||
HVX_Vector v4_5 = vptr[2]; // ...
|
||||
HVX_Vector v6_7 = vptr[3]; // ...
|
||||
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
const HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut;
|
||||
|
||||
HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F
|
||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4
|
||||
HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F
|
||||
HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4
|
||||
HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F
|
||||
HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4
|
||||
HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F
|
||||
HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4
|
||||
|
||||
v0 = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
|
||||
v1 = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
|
||||
v2 = Q6_Vb_vlut32_VbVbI(v2, lut, 0);
|
||||
v3 = Q6_Vb_vlut32_VbVbI(v3, lut, 0);
|
||||
v4 = Q6_Vb_vlut32_VbVbI(v4, lut, 0);
|
||||
v5 = Q6_Vb_vlut32_VbVbI(v5, lut, 0);
|
||||
v6 = Q6_Vb_vlut32_VbVbI(v6, lut, 0);
|
||||
v7 = Q6_Vb_vlut32_VbVbI(v7, lut, 0);
|
||||
|
||||
HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 };
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline HVX_Vector_x8 hvx_vec_load_iq4nlx4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
|
||||
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||
|
||||
const uint32_t qk = QK_Q4_0x4x2; // 256
|
||||
const uint32_t nb = n / qk;
|
||||
const uint32_t nloe = n % qk;
|
||||
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
const HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut;
|
||||
|
||||
HVX_Vector_x8 r;
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(2)
|
||||
for (i = 0; i < nb; i++) {
|
||||
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
|
||||
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements
|
||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements
|
||||
r.v[i * 2 + 0] = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
|
||||
r.v[i * 2 + 1] = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
|
||||
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements
|
||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements
|
||||
HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
|
||||
r.v[i * 2 + 0] = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v0_1_p), lut, 0);
|
||||
r.v[i * 2 + 1] = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v0_1_p), lut, 0);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
// q4x4x2 and q8x4x2 are the flat q4/8_0 formats where all quants are stored first followed by all scales
|
||||
|
||||
static inline size_t q8x4x2_row_size(uint32_t ne) {
|
||||
@@ -921,6 +998,293 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
||||
hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); // row0,col1 row1,col1
|
||||
}
|
||||
|
||||
// ======== IQ4_NL x Q8_0 vec_dot kernels ========
|
||||
// Same structure as Q4_0 vec_dot but uses IQ4_NL LUT-based load (4-bit index -> int8 kvalue).
|
||||
// Scale format is identical to Q4_0 (fp16 scales).
|
||||
|
||||
static void vec_dot_iq4nlx4x2_q8x4x2_1x1(const int n,
|
||||
float * restrict s0,
|
||||
const void * restrict vx0,
|
||||
const void * restrict vy0) {
|
||||
assert(n % 32 == 0);
|
||||
assert((unsigned long) vx0 % 128 == 0);
|
||||
assert((unsigned long) vy0 % 128 == 0);
|
||||
|
||||
const uint32_t qk = QK_Q4_0x4x2 * 4;
|
||||
|
||||
const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
|
||||
const uint32_t x_qblk_size = qk / 2; // int4
|
||||
const uint32_t x_qrow_size = n / 2; // int4 (not padded)
|
||||
|
||||
const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
|
||||
const uint32_t y_qblk_size = qk; // int8
|
||||
const uint32_t y_qrow_size = n; // int8 (not padded)
|
||||
|
||||
const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first
|
||||
const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
|
||||
|
||||
const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
|
||||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||
|
||||
HVX_Vector r0_sum = Q6_V_vzero();
|
||||
|
||||
const uint32_t nb = n / qk;
|
||||
const uint32_t nloe = n % qk;
|
||||
|
||||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
}
|
||||
|
||||
r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
|
||||
|
||||
hvx_vec_store_u(s0, 4, r0_sum);
|
||||
}
|
||||
|
||||
static void vec_dot_iq4nlx4x2_q8x4x2_2x1(const int n,
|
||||
float * restrict s0,
|
||||
const void * restrict vx0,
|
||||
const void * restrict vx1,
|
||||
const void * restrict vy0) {
|
||||
assert(n % 32 == 0);
|
||||
assert((unsigned long) vx0 % 128 == 0);
|
||||
assert((unsigned long) vx1 % 128 == 0);
|
||||
assert((unsigned long) vy0 % 128 == 0);
|
||||
|
||||
const uint32_t qk = QK_Q4_0x4x2 * 4;
|
||||
|
||||
const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
|
||||
const uint32_t x_qblk_size = qk / 2; // int4
|
||||
const uint32_t x_qrow_size = n / 2; // int4 (not padded)
|
||||
|
||||
const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
|
||||
const uint32_t y_qblk_size = qk; // int8
|
||||
const uint32_t y_qrow_size = n; // int8 (not padded)
|
||||
|
||||
const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first
|
||||
const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales
|
||||
const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first
|
||||
const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales
|
||||
|
||||
const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
|
||||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||
|
||||
HVX_Vector r0_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_sum = Q6_V_vzero();
|
||||
|
||||
const uint32_t nb = n / qk;
|
||||
const uint32_t nloe = n % qk;
|
||||
|
||||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
|
||||
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
|
||||
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
|
||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||
r1_ia = Q6_V_vand_QV(bmask, r1_ia);
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
|
||||
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||
}
|
||||
|
||||
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
|
||||
hvx_vec_store_u(s0, 8, rsum);
|
||||
}
|
||||
|
||||
static void vec_dot_iq4nlx4x2_q8x4x2_2x2(const int n,
|
||||
float * restrict s0,
|
||||
float * restrict s1,
|
||||
const void * restrict vx0,
|
||||
const void * restrict vx1,
|
||||
const void * restrict vy0,
|
||||
const void * restrict vy1) {
|
||||
assert(n % 32 == 0);
|
||||
assert((unsigned long) vx0 % 128 == 0);
|
||||
assert((unsigned long) vx1 % 128 == 0);
|
||||
assert((unsigned long) vy0 % 128 == 0);
|
||||
assert((unsigned long) vy1 % 128 == 0);
|
||||
|
||||
const uint32_t qk = QK_Q4_0x4x2 * 4;
|
||||
|
||||
const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
|
||||
const uint32_t x_qblk_size = qk / 2; // int4
|
||||
const uint32_t x_qrow_size = n / 2; // int4 (not padded)
|
||||
|
||||
const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
|
||||
const uint32_t y_qblk_size = qk; // int8
|
||||
const uint32_t y_qrow_size = n; // int8 (not padded)
|
||||
|
||||
const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;
|
||||
const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;
|
||||
const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;
|
||||
const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;
|
||||
|
||||
const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0;
|
||||
const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size;
|
||||
const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0;
|
||||
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size;
|
||||
|
||||
HVX_Vector r0_c0_sum = Q6_V_vzero();
|
||||
HVX_Vector r0_c1_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_c0_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_c1_sum = Q6_V_vzero();
|
||||
|
||||
const uint32_t nb = n / qk;
|
||||
const uint32_t nloe = n % qk;
|
||||
|
||||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size);
|
||||
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
|
||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q));
|
||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q));
|
||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
|
||||
|
||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
|
||||
HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
|
||||
HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
|
||||
HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
|
||||
|
||||
HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
|
||||
HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
|
||||
HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
|
||||
HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
|
||||
|
||||
r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
|
||||
r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
|
||||
r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
|
||||
r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
|
||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
|
||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
|
||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
|
||||
|
||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
|
||||
HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
|
||||
HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
|
||||
HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
|
||||
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
|
||||
r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
|
||||
r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd);
|
||||
r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd);
|
||||
r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia);
|
||||
r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia);
|
||||
r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia);
|
||||
r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia);
|
||||
|
||||
HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
|
||||
HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
|
||||
HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
|
||||
HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
|
||||
|
||||
r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
|
||||
r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
|
||||
r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
|
||||
r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
|
||||
}
|
||||
|
||||
HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
|
||||
HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
|
||||
|
||||
hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum);
|
||||
hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum);
|
||||
}
|
||||
|
||||
static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
|
||||
assert(n % 32 == 0); // min sub-block size
|
||||
assert((unsigned long) vx0 % 128 == 0);
|
||||
@@ -2393,6 +2757,12 @@ static int htp_mminit_vec_dot(struct htp_matmul_context * mmctx, enum htp_data_t
|
||||
mmctx->vec_dot_2x1 = vec_dot_q8x4x2_q8x4x2_2x1;
|
||||
mmctx->vec_dot_2x2 = vec_dot_q8x4x2_q8x4x2_2x2;
|
||||
return 0;
|
||||
case HTP_TYPE_IQ4_NL:
|
||||
mmctx->type = "iq4nlx4x2-f32";
|
||||
mmctx->vec_dot_1x1 = vec_dot_iq4nlx4x2_q8x4x2_1x1;
|
||||
mmctx->vec_dot_2x1 = vec_dot_iq4nlx4x2_q8x4x2_2x1;
|
||||
mmctx->vec_dot_2x2 = vec_dot_iq4nlx4x2_q8x4x2_2x2;
|
||||
return 0;
|
||||
case HTP_TYPE_MXFP4:
|
||||
mmctx->type = "mxfp4x4x2-f32";
|
||||
mmctx->vec_dot_1x1 = vec_dot_mxfp4x4x2_q8x4x2_1x1;
|
||||
@@ -2556,6 +2926,13 @@ int op_matmul(struct htp_ops_context * octx) {
|
||||
const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
|
||||
mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
|
||||
worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs);
|
||||
// Cache where src1 was written so subsequent SKIP_QUANTIZE ops can find it
|
||||
octx->ctx->prev_src1_spad = octx->src1_spad.data;
|
||||
} else {
|
||||
// SKIP_QUANTIZE: Q8 data lives at the address written by the previous
|
||||
// quantize pass. The current op may have a different src0 size (e.g.
|
||||
// IQ4_NL vs MXFP4), so src1_spad.data computed above could be wrong.
|
||||
octx->src1_spad.data = octx->ctx->prev_src1_spad;
|
||||
}
|
||||
|
||||
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
|
||||
@@ -2659,6 +3036,9 @@ int op_matmul_id(struct htp_ops_context * octx) {
|
||||
const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
|
||||
mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
|
||||
worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs);
|
||||
octx->ctx->prev_src1_spad = octx->src1_spad.data;
|
||||
} else {
|
||||
octx->src1_spad.data = octx->ctx->prev_src1_spad;
|
||||
}
|
||||
|
||||
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
|
||||
|
||||
@@ -690,7 +690,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
|
||||
" auto tB = B.slice((int)tgid.x, 0); \n"
|
||||
" \n"
|
||||
" matmul2d< \n"
|
||||
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
|
||||
" matmul2d_descriptor(16, 16, dynamic_extent), \n"
|
||||
" execution_simdgroups<4>> mm; \n"
|
||||
" \n"
|
||||
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
|
||||
@@ -740,7 +740,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
|
||||
" auto tB = B.slice((int)tgid.x, 0); \n"
|
||||
" \n"
|
||||
" matmul2d< \n"
|
||||
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
|
||||
" matmul2d_descriptor(16, 16, dynamic_extent), \n"
|
||||
" execution_simdgroups<4>> mm; \n"
|
||||
" \n"
|
||||
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
|
||||
|
||||
@@ -394,6 +394,9 @@ struct ggml_backend_opencl_context {
|
||||
bool fp16_support;
|
||||
bool has_vector_subgroup_broadcast;
|
||||
bool disable_fusion;
|
||||
|
||||
bool adreno_has_large_buffer;
|
||||
bool adreno_use_large_buffer;
|
||||
ggml_cl_compiler_version adreno_cl_compiler_version;
|
||||
|
||||
int adreno_wave_size;
|
||||
@@ -787,6 +790,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
||||
" -cl-finite-math-only -cl-fast-relaxed-math";
|
||||
|
||||
if (backend_ctx->adreno_use_large_buffer) {
|
||||
compile_opts += " -qcom-enable-large-buffer ";
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("ggml_opencl: loading OpenCL kernels");
|
||||
|
||||
// add
|
||||
@@ -3020,6 +3027,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
// Check if ext_buffer contains cl_khr_fp16
|
||||
backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
||||
GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
|
||||
// check Adreno large buffer support
|
||||
backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL;
|
||||
|
||||
// fp16 is required
|
||||
if (!backend_ctx->fp16_support) {
|
||||
@@ -3086,6 +3095,18 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
// determine whether to use large buffer for Adreno
|
||||
backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr &&
|
||||
backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
|
||||
if (backend_ctx->adreno_use_large_buffer) {
|
||||
if (!backend_ctx->adreno_has_large_buffer) {
|
||||
GGML_LOG_INFO("ggml_opencl: Adreno large buffer requested but not supported by driver, will use regular buffer\n");
|
||||
backend_ctx->adreno_use_large_buffer = false;
|
||||
} else {
|
||||
GGML_LOG_INFO("ggml_opencl: Adreno large buffer enabled\n");
|
||||
}
|
||||
}
|
||||
|
||||
cl_int err;
|
||||
|
||||
// A local ref of cl_context for convenience
|
||||
@@ -5660,6 +5681,11 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
|
||||
|
||||
cl_int err;
|
||||
cl_mem mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &err);
|
||||
if (err != CL_SUCCESS && backend_ctx->adreno_use_large_buffer) {
|
||||
cl_mem_properties props[] = { 0x41A6 /* CL_LARGE_BUFFER_QCOM */, 1, 0 };
|
||||
mem = clCreateBufferWithProperties(backend_ctx->context, props, CL_MEM_READ_WRITE, size, NULL, &err);
|
||||
}
|
||||
|
||||
if (err != CL_SUCCESS) {
|
||||
GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
|
||||
return nullptr;
|
||||
|
||||
@@ -589,8 +589,10 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
||||
ggml_backend_buffer_t buffer = tensor->buffer;
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
result.buffer = ctx != nullptr ? ctx->remote_ptr : 0;
|
||||
result.data = reinterpret_cast<uint64_t>(tensor->data);
|
||||
} else {
|
||||
result.buffer = 0;
|
||||
result.data = 0;
|
||||
}
|
||||
for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
result.ne[i] = tensor->ne[i];
|
||||
@@ -606,7 +608,6 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
||||
}
|
||||
result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
|
||||
result.view_offs = tensor->view_offs;
|
||||
result.data = reinterpret_cast<uint64_t>(tensor->data);
|
||||
|
||||
// Avoid sending uninitialized data over the wire
|
||||
memset(result.name, 0, sizeof(result.name));
|
||||
@@ -1443,9 +1444,11 @@ ggml_tensor * rpc_server::create_node(uint64_t id,
|
||||
const rpc_tensor * tensor = it_ptr->second;
|
||||
|
||||
struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
|
||||
if (result == nullptr || result->buffer == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] invalid tensor: null %s (id=%" PRIu64 ")\n",
|
||||
__func__, result == nullptr ? "tensor" : "buffer", id);
|
||||
if (result == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
if (result->buffer == nullptr && result->data != nullptr) {
|
||||
GGML_LOG_ERROR("[%s] invalid data ptr", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
tensor_map[id] = result;
|
||||
|
||||
@@ -1112,6 +1112,16 @@ struct vk_op_glu_push_constants {
|
||||
uint32_t mode; // 0: default, 1: swapped, 2: split
|
||||
float alpha; // for swiglu_oai
|
||||
float limit;
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb03;
|
||||
uint32_t ne01;
|
||||
uint32_t ne02;
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
uint32_t ne11;
|
||||
uint32_t ne12;
|
||||
};
|
||||
|
||||
struct vk_op_unary_push_constants {
|
||||
@@ -5044,7 +5054,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
} else {
|
||||
device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
|
||||
}
|
||||
vk::DeviceCreateInfo device_create_info;
|
||||
vk::DeviceCreateInfo device_create_info{};
|
||||
std::vector<const char *> device_extensions;
|
||||
vk::PhysicalDeviceFeatures device_features = device->physical_device.getFeatures();
|
||||
|
||||
@@ -5413,12 +5423,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
#endif
|
||||
device->name = GGML_VK_NAME + std::to_string(idx);
|
||||
|
||||
device_create_info = {
|
||||
vk::DeviceCreateFlags(),
|
||||
device_queue_create_infos,
|
||||
{},
|
||||
device_extensions
|
||||
};
|
||||
device_create_info
|
||||
.setFlags(vk::DeviceCreateFlags())
|
||||
.setQueueCreateInfos(device_queue_create_infos)
|
||||
.setPEnabledExtensionNames(device_extensions);
|
||||
device_create_info.setPNext(&device_features2);
|
||||
device->device = device->physical_device.createDevice(device_create_info);
|
||||
|
||||
@@ -11048,8 +11056,6 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||
const float alpha = op_params_f[2];
|
||||
const float limit = op_params_f[3];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
if (!split) {
|
||||
GGML_ASSERT(src0->ne[0] / 2 == dst->ne[0]);
|
||||
} else {
|
||||
@@ -11067,7 +11073,17 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||
(uint32_t)dst->ne[0],
|
||||
mode,
|
||||
alpha,
|
||||
limit
|
||||
limit,
|
||||
(uint32_t)(src0->nb[1] / src0->nb[0]),
|
||||
(uint32_t)(src0->nb[2] / src0->nb[0]),
|
||||
(uint32_t)(src0->nb[3] / src0->nb[0]),
|
||||
(uint32_t)src0->ne[1],
|
||||
(uint32_t)src0->ne[2],
|
||||
(uint32_t)(dst->nb[1] / dst->nb[0]),
|
||||
(uint32_t)(dst->nb[2] / dst->nb[0]),
|
||||
(uint32_t)(dst->nb[3] / dst->nb[0]),
|
||||
(uint32_t)dst->ne[1],
|
||||
(uint32_t)dst->ne[2]
|
||||
});
|
||||
}
|
||||
|
||||
@@ -15217,8 +15233,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_GLU_OP_SWIGLU_OAI:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ggml_is_contiguous(op->src[0]) &&
|
||||
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
|
||||
(op->src[0]->type == op->type);
|
||||
default:
|
||||
|
||||
@@ -16,4 +16,14 @@ layout (push_constant) uniform parameter
|
||||
uint mode;
|
||||
float alpha;
|
||||
float limit;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
uint ne01;
|
||||
uint ne02;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
uint ne11;
|
||||
uint ne12;
|
||||
} p;
|
||||
|
||||
@@ -8,22 +8,32 @@ void main() {
|
||||
const uint row = i / p.ne20;
|
||||
const uint col = i - row * p.ne20;
|
||||
|
||||
const uint i3 = row / (p.ne01 * p.ne02);
|
||||
const uint i2 = (row % (p.ne01 * p.ne02)) / p.ne01;
|
||||
const uint i1 = row % p.ne01;
|
||||
const uint src_idx = i3 * p.nb03 + i2 * p.nb02 + i1 * p.nb01 + col;
|
||||
|
||||
const uint dst_i3 = row / (p.ne11 * p.ne12);
|
||||
const uint dst_i2 = (row % (p.ne11 * p.ne12)) / p.ne11;
|
||||
const uint dst_i1 = row % p.ne11;
|
||||
const uint dst_idx = dst_i3 * p.nb13 + dst_i2 * p.nb12 + dst_i1 * p.nb11 + col;
|
||||
|
||||
if (p.mode == 0) {
|
||||
// Default
|
||||
const uint offset = p.ne00 / 2;
|
||||
const uint idx = row * p.ne00 + col;
|
||||
const uint idx = src_idx;
|
||||
|
||||
data_d[row * offset + col] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset])));
|
||||
data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset])));
|
||||
} else if (p.mode == 1) {
|
||||
// Swapped
|
||||
const uint offset = p.ne00 / 2;
|
||||
const uint idx = row * p.ne00 + col;
|
||||
const uint idx = src_idx;
|
||||
|
||||
data_d[row * offset + col] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx])));
|
||||
data_d[dst_idx] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx])));
|
||||
} else {
|
||||
// Split
|
||||
const uint idx = row * p.ne00 + col;
|
||||
const uint idx = src_idx;
|
||||
|
||||
data_d[idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx])));
|
||||
data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx])));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4962,6 +4962,7 @@ static struct ggml_tensor * ggml_interpolate_impl(
|
||||
GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT);
|
||||
// TODO: implement antialias for modes other than bilinear
|
||||
GGML_ASSERT(!(mode & GGML_SCALE_FLAG_ANTIALIAS) || (mode & 0xFF) == GGML_SCALE_MODE_BILINEAR);
|
||||
GGML_ASSERT(a->type == GGML_TYPE_F32);
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
|
||||
|
||||
@@ -5307,6 +5308,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
|
||||
GGML_ASSERT(q->ne[3] == v->ne[3]);
|
||||
|
||||
if (mask) {
|
||||
GGML_ASSERT(mask->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(ggml_is_contiguous(mask));
|
||||
//GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
|
||||
|
||||
|
||||
@@ -326,6 +326,11 @@ class Keys:
|
||||
class Projector:
|
||||
SCALE_FACTOR = "clip.vision.projector.scale_factor"
|
||||
|
||||
class SAM:
|
||||
BLOCK_COUNT = "clip.vision.sam.block_count"
|
||||
EMBEDDING_LENGTH = "clip.vision.sam.embedding_length"
|
||||
HEAD_COUNT = "clip.vision.sam.head_count"
|
||||
|
||||
class ClipAudio:
|
||||
PROJECTOR_TYPE = "clip.audio.projector_type" # for mixed modality models
|
||||
NUM_MEL_BINS = "clip.audio.num_mel_bins"
|
||||
@@ -434,6 +439,7 @@ class MODEL_ARCH(IntEnum):
|
||||
ARCTIC = auto()
|
||||
DEEPSEEK = auto()
|
||||
DEEPSEEK2 = auto()
|
||||
DEEPSEEK2OCR = auto()
|
||||
CHATGLM = auto()
|
||||
GLM4 = auto()
|
||||
GLM4_MOE = auto()
|
||||
@@ -755,6 +761,22 @@ class MODEL_TENSOR(IntEnum):
|
||||
V_MM_GATE = auto() # cogvlm
|
||||
V_TOK_BOI = auto() # cogvlm
|
||||
V_TOK_EOI = auto() # cogvlm
|
||||
V_SAM_POS_EMBD = auto() # Deepseek-OCR
|
||||
V_SAM_PATCH_EMBD = auto() # Deepseek-OCR
|
||||
V_SAM_PRE_NORM = auto() # Deepseek-OCR
|
||||
V_SAM_POST_NORM = auto() # Deepseek-OCR
|
||||
V_SAM_ATTN_POS_H = auto() # Deepseek-OCR
|
||||
V_SAM_ATTN_POS_W = auto() # Deepseek-OCR
|
||||
V_SAM_ATTN_QKV = auto() # Deepseek-OCR
|
||||
V_SAM_ATTN_OUT = auto() # Deepseek-OCR
|
||||
V_SAM_MLP_LIN_1 = auto() # Deepseek-OCR
|
||||
V_SAM_MLP_LIN_2 = auto() # Deepseek-OCR
|
||||
V_SAM_NECK = auto() # Deepseek-OCR
|
||||
V_SAM_NET_2 = auto() # Deepseek-OCR
|
||||
V_SAM_NET_3 = auto() # Deepseek-OCR
|
||||
V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR
|
||||
V_ENC_EMBD_VSEP = auto() # Deepseek-OCR
|
||||
|
||||
# audio (mtmd)
|
||||
A_ENC_EMBD_POS = auto()
|
||||
A_ENC_EMBD_NORM = auto()
|
||||
@@ -880,6 +902,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.ARCTIC: "arctic",
|
||||
MODEL_ARCH.DEEPSEEK: "deepseek",
|
||||
MODEL_ARCH.DEEPSEEK2: "deepseek2",
|
||||
MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr",
|
||||
MODEL_ARCH.CHATGLM: "chatglm",
|
||||
MODEL_ARCH.GLM4: "glm4",
|
||||
MODEL_ARCH.GLM4_MOE: "glm4moe",
|
||||
@@ -1199,6 +1222,22 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.V_MM_GATE: "mm.gate",
|
||||
MODEL_TENSOR.V_TOK_BOI: "v.boi",
|
||||
MODEL_TENSOR.V_TOK_EOI: "v.eoi",
|
||||
# DeepSeek-OCR SAM
|
||||
MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd",
|
||||
MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd",
|
||||
MODEL_TENSOR.V_SAM_PRE_NORM: "v.sam.blk.{bid}.pre_ln",
|
||||
MODEL_TENSOR.V_SAM_POST_NORM: "v.sam.blk.{bid}.post_ln",
|
||||
MODEL_TENSOR.V_SAM_ATTN_POS_H: "v.sam.blk.{bid}.attn.pos_h",
|
||||
MODEL_TENSOR.V_SAM_ATTN_POS_W: "v.sam.blk.{bid}.attn.pos_w",
|
||||
MODEL_TENSOR.V_SAM_ATTN_QKV: "v.sam.blk.{bid}.attn.qkv",
|
||||
MODEL_TENSOR.V_SAM_ATTN_OUT: "v.sam.blk.{bid}.attn.out",
|
||||
MODEL_TENSOR.V_SAM_MLP_LIN_1: "v.sam.blk.{bid}.mlp.lin1",
|
||||
MODEL_TENSOR.V_SAM_MLP_LIN_2: "v.sam.blk.{bid}.mlp.lin2",
|
||||
MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}",
|
||||
MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2",
|
||||
MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3",
|
||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR
|
||||
MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR
|
||||
# audio (mtmd)
|
||||
# note: all audio tensor names must use prefix "a." or "mm.a."
|
||||
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
|
||||
@@ -1265,6 +1304,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.V_ENC_EMBD_PATCH,
|
||||
MODEL_TENSOR.V_ENC_EMBD_NORM,
|
||||
MODEL_TENSOR.V_ENC_EMBD_POS,
|
||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL,
|
||||
MODEL_TENSOR.V_ENC_EMBD_VSEP,
|
||||
MODEL_TENSOR.V_ENC_INPUT_NORM,
|
||||
MODEL_TENSOR.V_ENC_ATTN_QKV,
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q,
|
||||
@@ -1317,6 +1358,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.V_MM_GATE,
|
||||
MODEL_TENSOR.V_TOK_BOI,
|
||||
MODEL_TENSOR.V_TOK_EOI,
|
||||
MODEL_TENSOR.V_SAM_POS_EMBD,
|
||||
MODEL_TENSOR.V_SAM_PATCH_EMBD,
|
||||
MODEL_TENSOR.V_SAM_PRE_NORM,
|
||||
MODEL_TENSOR.V_SAM_POST_NORM,
|
||||
MODEL_TENSOR.V_SAM_ATTN_POS_H,
|
||||
MODEL_TENSOR.V_SAM_ATTN_POS_W,
|
||||
MODEL_TENSOR.V_SAM_ATTN_QKV,
|
||||
MODEL_TENSOR.V_SAM_ATTN_OUT,
|
||||
MODEL_TENSOR.V_SAM_MLP_LIN_1,
|
||||
MODEL_TENSOR.V_SAM_MLP_LIN_2,
|
||||
MODEL_TENSOR.V_SAM_NECK,
|
||||
MODEL_TENSOR.V_SAM_NET_2,
|
||||
MODEL_TENSOR.V_SAM_NET_3,
|
||||
# audio
|
||||
MODEL_TENSOR.A_ENC_EMBD_POS,
|
||||
MODEL_TENSOR.A_ENC_EMBD_NORM,
|
||||
@@ -2612,7 +2666,41 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.ATTN_Q_B,
|
||||
MODEL_TENSOR.ATTN_KV_A_MQA,
|
||||
MODEL_TENSOR.ATTN_KV_B,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_B,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_V_B,
|
||||
MODEL_TENSOR.ATTN_Q_A_NORM,
|
||||
MODEL_TENSOR.ATTN_KV_A_NORM,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP,
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
MODEL_TENSOR.FFN_EXP_PROBS_B,
|
||||
],
|
||||
MODEL_ARCH.DEEPSEEK2OCR: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_A,
|
||||
MODEL_TENSOR.ATTN_Q_B,
|
||||
MODEL_TENSOR.ATTN_KV_A_MQA,
|
||||
MODEL_TENSOR.ATTN_KV_B,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_B,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_V_B,
|
||||
MODEL_TENSOR.ATTN_Q_A_NORM,
|
||||
MODEL_TENSOR.ATTN_KV_A_NORM,
|
||||
@@ -3741,6 +3829,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
MODEL_ARCH.DEEPSEEK2OCR: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
MODEL_ARCH.CHATGLM: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
],
|
||||
@@ -3938,6 +4030,7 @@ class VisionProjectorType:
|
||||
LIGHTONOCR = "lightonocr"
|
||||
COGVLM = "cogvlm"
|
||||
JANUS_PRO = "janus_pro"
|
||||
DEEPSEEKOCR = "deepseekocr"
|
||||
LFM2A = "lfm2a" # audio
|
||||
MUSIC_FLAMINGO = "musicflamingo" # audio
|
||||
GLM4V = "glm4v"
|
||||
|
||||
@@ -1218,6 +1218,15 @@ class GGUFWriter:
|
||||
def add_vision_window_size(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value)
|
||||
|
||||
def add_vision_sam_layers_count(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value)
|
||||
|
||||
def add_vision_sam_embedding_length(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.SAM.EMBEDDING_LENGTH, value)
|
||||
|
||||
def add_vision_sam_head_count(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.SAM.HEAD_COUNT, value)
|
||||
|
||||
# audio models
|
||||
|
||||
def add_clip_audio_projector_type(self, value: str) -> None:
|
||||
|
||||
@@ -63,6 +63,7 @@ class TensorNameMap:
|
||||
"transformer.wpe", # gpt2
|
||||
"embeddings.position_embeddings", # bert
|
||||
"wpe", # gpt2
|
||||
"model.embed_positions", # rugpt3xl
|
||||
),
|
||||
|
||||
# Output
|
||||
@@ -1344,6 +1345,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.V_MMPROJ_FC: (
|
||||
"model.connector.modality_projection.proj", # SmolVLM
|
||||
"model.vision.linear_proj.linear_proj", # cogvlm
|
||||
"model.projector.layers", # Deepseek-OCR
|
||||
"visual.merger.proj", # glm4v
|
||||
),
|
||||
|
||||
@@ -1364,6 +1366,7 @@ class TensorNameMap:
|
||||
"vision_model.class_embedding", # llama 4
|
||||
"model.vision.patch_embedding.cls_embedding", # cogvlm
|
||||
"vision_model.radio_model.model.patch_generator.cls_token.token", # Nemotron Nano v2 VL
|
||||
"model.vision_model.embeddings.class_embedding", # Deepseek-OCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_PATCH: (
|
||||
@@ -1377,6 +1380,7 @@ class TensorNameMap:
|
||||
"visual.patch_embed.proj", # qwen2vl
|
||||
"vision_tower.patch_embed.proj", # kimi-vl
|
||||
"model.vision.patch_embedding.proj", # cogvlm
|
||||
"model.vision_model.embeddings.patch_embedding", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.embeddings.patch_embedding",
|
||||
"vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
|
||||
),
|
||||
@@ -1398,10 +1402,19 @@ class TensorNameMap:
|
||||
"vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
|
||||
"model.image_newline", # Deepseek-OCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_VSEP: (
|
||||
"model.view_seperator", # Deepseek-OCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_QKV: (
|
||||
"visual.blocks.{bid}.attn.qkv", # qwen3vl
|
||||
"model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
|
||||
"vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5
|
||||
"model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj", # Deepseek-OCR CLIP
|
||||
"vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5
|
||||
"vision_model.radio_model.model.blocks.{bid}.attn.qkv", # Nemotron Nano v2 VL
|
||||
),
|
||||
|
||||
@@ -1416,6 +1429,7 @@ class TensorNameMap:
|
||||
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
|
||||
"vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
|
||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
|
||||
"model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
|
||||
@@ -1434,6 +1448,7 @@ class TensorNameMap:
|
||||
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
|
||||
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
|
||||
"vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
|
||||
"model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
|
||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
|
||||
),
|
||||
|
||||
@@ -1454,6 +1469,7 @@ class TensorNameMap:
|
||||
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
|
||||
"vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
|
||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
|
||||
"model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_INPUT_NORM: (
|
||||
@@ -1468,6 +1484,7 @@ class TensorNameMap:
|
||||
"visual.blocks.{bid}.norm1", # qwen2vl
|
||||
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
|
||||
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
|
||||
"model.vision_model.transformer.layers.{bid}.layer_norm1", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
|
||||
"vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
|
||||
),
|
||||
@@ -1485,6 +1502,7 @@ class TensorNameMap:
|
||||
"visual.blocks.{bid}.attn.proj", # qwen2vl
|
||||
"vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
|
||||
"model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
|
||||
"model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
|
||||
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
|
||||
),
|
||||
@@ -1501,6 +1519,7 @@ class TensorNameMap:
|
||||
"visual.blocks.{bid}.norm2", # qwen2vl
|
||||
"vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
|
||||
"model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
|
||||
"model.vision_model.transformer.layers.{bid}.layer_norm2", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
|
||||
"vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
|
||||
),
|
||||
@@ -1517,6 +1536,7 @@ class TensorNameMap:
|
||||
"visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
|
||||
"visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl
|
||||
"vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
|
||||
"model.vision_model.transformer.layers.{bid}.mlp.fc1", # Deepseek-OCR CLIP
|
||||
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
|
||||
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
|
||||
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
|
||||
@@ -1541,6 +1561,7 @@ class TensorNameMap:
|
||||
"visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl
|
||||
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
|
||||
"model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
|
||||
"model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
|
||||
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
|
||||
),
|
||||
@@ -1560,6 +1581,7 @@ class TensorNameMap:
|
||||
"vision_tower.ln_pre", # pixtral-hf
|
||||
"vision_encoder.ln_pre", # pixtral
|
||||
"vision_model.layernorm_pre", # llama4
|
||||
"model.vision_model.pre_layrnorm", # Deepseek-OCR CLIP
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_POST_NORM: (
|
||||
@@ -1662,6 +1684,58 @@ class TensorNameMap:
|
||||
"model.visual.deepstack_merger_list.{bid}.linear_fc2", # deepstack in qwen3vl
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_POS_EMBD: (
|
||||
"model.sam_model.pos_embed",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_PATCH_EMBD: (
|
||||
"model.sam_model.patch_embed.proj",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_PRE_NORM: (
|
||||
"model.sam_model.blocks.{bid}.norm1", # deepstack in qwen3vl
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_POST_NORM: (
|
||||
"model.sam_model.blocks.{bid}.norm2", # deepstack in qwen3vl
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_ATTN_POS_H: (
|
||||
"model.sam_model.blocks.{bid}.attn.rel_pos_h",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_ATTN_POS_W: (
|
||||
"model.sam_model.blocks.{bid}.attn.rel_pos_w",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_ATTN_QKV: (
|
||||
"model.sam_model.blocks.{bid}.attn.qkv",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_ATTN_OUT: (
|
||||
"model.sam_model.blocks.{bid}.attn.proj",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_MLP_LIN_1: (
|
||||
"model.sam_model.blocks.{bid}.mlp.lin1",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_MLP_LIN_2: (
|
||||
"model.sam_model.blocks.{bid}.mlp.lin2",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_NECK: (
|
||||
"model.sam_model.neck.{bid}",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_NET_2: (
|
||||
"model.sam_model.net_2",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_SAM_NET_3: (
|
||||
"model.sam_model.net_3",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_POST_FC_NORM: (
|
||||
"model.vision.linear_proj.norm1", # cogvlm
|
||||
),
|
||||
|
||||
@@ -2,37 +2,51 @@
|
||||
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
import re
|
||||
|
||||
|
||||
def parse_log_file(filepath):
|
||||
"""Parse log file and extract function VGPR usage."""
|
||||
import re
|
||||
|
||||
functions = defaultdict(lambda: {'vgprs': 0, 'spill': 0, 'location': ''})
|
||||
func_stack = []
|
||||
|
||||
try:
|
||||
with open(filepath, 'r') as f:
|
||||
content = f.read()
|
||||
# Find all function entries with VGPR usage including location
|
||||
pattern = r'([^:]+:\d+):.*?Function Name: (\S+).*?VGPRs: (\d+).*?VGPRs Spill: (\d+)'
|
||||
matches = re.findall(pattern, content, re.DOTALL)
|
||||
for line in f:
|
||||
# Match function name lines
|
||||
func_match = re.search(r'remark: ([^:]+):(\d+):\d+: Function Name: (\S+)', line)
|
||||
if func_match:
|
||||
location = func_match.group(1) + ':' + func_match.group(2)
|
||||
func_name = func_match.group(3)
|
||||
# Extract just the filename and line number
|
||||
parts = location.split('/')
|
||||
short_location = parts[-1] if len(parts) > 0 else location
|
||||
functions[func_name]['location'] = short_location
|
||||
# Push function onto stack with its location
|
||||
func_stack.append({'name': func_name, 'location': location})
|
||||
continue
|
||||
|
||||
for location, func_name, vgprs, spill in matches:
|
||||
functions[func_name]['vgprs'] = int(vgprs)
|
||||
functions[func_name]['spill'] = int(spill)
|
||||
# Extract just the filename and line number
|
||||
parts = location.split('/')
|
||||
if len(parts) > 0:
|
||||
short_location = parts[-1] # Get last part (filename)
|
||||
# Check if there's a line number after filename
|
||||
if ':' in short_location:
|
||||
functions[func_name]['location'] = short_location
|
||||
else:
|
||||
functions[func_name]['location'] = location
|
||||
else:
|
||||
functions[func_name]['location'] = location
|
||||
# Match VGPR usage lines (only if we have functions in stack)
|
||||
vgpr_match = re.search(r'remark: ([^:]+):(\d+):\d+:\s+VGPRs: (\d+)', line)
|
||||
if vgpr_match:
|
||||
location = vgpr_match.group(1) + ':' + vgpr_match.group(2)
|
||||
# Find the most recent function with matching location
|
||||
for i in range(len(func_stack) - 1, -1, -1):
|
||||
if func_stack[i]['location'] == location:
|
||||
functions[func_stack[i]['name']]['vgprs'] = int(vgpr_match.group(3))
|
||||
break
|
||||
continue
|
||||
|
||||
spill_match = re.search(r'remark: ([^:]+):(\d+):\d+:\s+VGPRs Spill: (\d+)', line)
|
||||
if spill_match:
|
||||
location = spill_match.group(1) + ':' + spill_match.group(2)
|
||||
# Find the most recent function with matching location
|
||||
for i in range(len(func_stack) - 1, -1, -1):
|
||||
if func_stack[i]['location'] == location:
|
||||
functions[func_stack[i]['name']]['spill'] = int(spill_match.group(3))
|
||||
break
|
||||
continue
|
||||
except FileNotFoundError:
|
||||
print(f"Error: File {filepath} not found", file=sys.stderr) # noqa: NP100
|
||||
print(f"Error: File {filepath} not found", file=sys.stderr) # noqa: NP100
|
||||
sys.exit(1)
|
||||
|
||||
return functions
|
||||
@@ -40,7 +54,7 @@ def parse_log_file(filepath):
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: ./vgpr_check.py <log_file>", file=sys.stderr) # noqa: NP100
|
||||
print("Usage: ./vgpr_check.py <log_file>", file=sys.stderr) # noqa: NP100
|
||||
sys.exit(1)
|
||||
|
||||
log_file = sys.argv[1]
|
||||
@@ -123,6 +137,9 @@ def main():
|
||||
'_ZL18flash_attn_ext_f16ILi128ELi128ELi32ELi2ELb1ELb0EEvPKcS1_S1_S1_S1_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS5_IjLj3EEiiiiiiiiiiiliiliiiiil',
|
||||
'_ZL18flash_attn_ext_f16ILi128ELi128ELi4ELi8ELb1ELb0EEvPKcS1_S1_S1_S1_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS5_IjLj3EEiiiiiiiiiiiliiliiiiil',
|
||||
'_ZL18flash_attn_ext_f16ILi96ELi96ELi4ELi8ELb0ELb0EEvPKcS1_S1_S1_S1_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS5_IjLj3EEiiiiiiiiiiiliiliiiiil',
|
||||
'_ZL18flash_attn_ext_vecILi128ELi2EL9ggml_type2ELS0_2ELb0EEvPKcS2_S2_S2_S2_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS6_IjLj3EEiiiiiiiiiiiliiliiiiil',
|
||||
'_ZL9mul_mat_qIL9ggml_type10ELi16ELb1EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii',
|
||||
'_ZL9mul_mat_qIL9ggml_type12ELi128ELb1EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii'
|
||||
}
|
||||
|
||||
functions = parse_log_file(log_file)
|
||||
@@ -134,7 +151,7 @@ def main():
|
||||
total_vgprs = int(data['vgprs']) + int(data['spill'])
|
||||
if total_vgprs > 256 and func_name in ignored and func_name not in printed_ignored:
|
||||
location = data.get('location', log_file)
|
||||
print(f"{location}: {func_name} - Total VGPRs: {total_vgprs} ({data['vgprs']} + {data['spill']}) [IGNORED]") # noqa: NP100
|
||||
print(f"{location}: {func_name} - Total VGPRs: {total_vgprs} ({data['vgprs']} + {data['spill']}) [IGNORED]") # noqa: NP100
|
||||
printed_ignored.add(func_name)
|
||||
|
||||
# Then print new functions with issues in red
|
||||
@@ -146,7 +163,7 @@ def main():
|
||||
# Print in red if not ignored
|
||||
color_code = "\033[91m" if func_name not in ignored else ""
|
||||
reset_code = "\033[0m" if func_name not in ignored else ""
|
||||
print(f"{color_code}{location}: {func_name} - Total VGPRs: {total_vgprs} ({data['vgprs']} + {data['spill']}) {status}{reset_code}") # noqa: NP100
|
||||
print(f"{color_code}{location}: {func_name} - Total VGPRs: {total_vgprs} ({data['vgprs']} + {data['spill']}) {status}{reset_code}") # noqa: NP100
|
||||
if func_name not in ignored:
|
||||
found_issues = True
|
||||
|
||||
|
||||
@@ -20,6 +20,14 @@ if ($null -ne $env:V) {
|
||||
$env:GGML_HEXAGON_VERBOSE=$env:V
|
||||
}
|
||||
|
||||
if ($null -ne $env:E) {
|
||||
$env:GGML_HEXAGON_EXPERIMENTAL=$env:E
|
||||
}
|
||||
|
||||
if ($null -ne $env:PROF) {
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
|
||||
}
|
||||
|
||||
if ($null -ne $env:OPMASK) {
|
||||
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
|
||||
}
|
||||
@@ -32,6 +40,10 @@ if ($null -ne $env:NDEV) {
|
||||
$env:GGML_HEXAGON_NDEV=$env:NDEV
|
||||
}
|
||||
|
||||
if ($null -ne $env:HB) {
|
||||
$env:GGML_HEXAGON_HOSTBUF=$env:HB
|
||||
}
|
||||
|
||||
$env:ADSP_LIBRARY_PATH="$basedir\lib"
|
||||
|
||||
& "$basedir\bin\llama-bench.exe" `
|
||||
|
||||
@@ -44,10 +44,14 @@ if ($null -ne $env:NDEV) {
|
||||
$env:GGML_HEXAGON_NDEV=$env:NDEV
|
||||
}
|
||||
|
||||
if ($null -ne $env:HB) {
|
||||
$env:GGML_HEXAGON_HOSTBUF=$env:HB
|
||||
}
|
||||
|
||||
$env:ADSP_LIBRARY_PATH="$basedir\lib"
|
||||
|
||||
& "$basedir\bin\llama-cli.exe" `
|
||||
--no-mmap -m $basedir\..\..\gguf\$model `
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
|
||||
--ctx-size 8192 --ubatch-size 128 -fa on `
|
||||
--ctx-size 8192 --ubatch-size 256 -fa on `
|
||||
-ngl 99 --device $device $cli_opts
|
||||
|
||||
@@ -44,10 +44,14 @@ if ($null -ne $env:NDEV) {
|
||||
$env:GGML_HEXAGON_NDEV=$env:NDEV
|
||||
}
|
||||
|
||||
if ($null -ne $env:HB) {
|
||||
$env:GGML_HEXAGON_HOSTBUF=$env:HB
|
||||
}
|
||||
|
||||
$env:ADSP_LIBRARY_PATH="$basedir\lib"
|
||||
|
||||
& "$basedir\bin\llama-completion.exe" `
|
||||
--no-mmap -m $basedir\..\..\gguf\$model `
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
|
||||
--ctx-size 8192 --batch-size 128 -fa on `
|
||||
--ctx-size 8192 --batch-size 256 -fa on `
|
||||
-ngl 99 -no-cnv --device $device $cli_opts
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env pwsh
|
||||
|
||||
# Basedir on device
|
||||
$basedir=".\pkg-snapdragon"
|
||||
|
||||
$cli_opts=$args
|
||||
|
||||
$model="gemma-3-4b-it-Q4_0.gguf"
|
||||
if ($null -ne $env:M) {
|
||||
$model=$env:M
|
||||
}
|
||||
|
||||
$mmproj="mmproj-F16.gguf"
|
||||
if ($null -ne $env:MMPROJ) {
|
||||
$mmproj=$env:MMPROJ
|
||||
}
|
||||
|
||||
$image=""
|
||||
if ($null -ne $env:IMG) {
|
||||
$image=$env:IMG
|
||||
}
|
||||
|
||||
$device="HTP0"
|
||||
if ($null -ne $env:D) {
|
||||
$device=$env:D
|
||||
}
|
||||
|
||||
if ($null -ne $env:V) {
|
||||
$env:GGML_HEXAGON_VERBOSE=$env:V
|
||||
}
|
||||
|
||||
# Default experimental to 1
|
||||
$env:GGML_HEXAGON_EXPERIMENTAL=1
|
||||
if ($null -ne $env:E) {
|
||||
$env:GGML_HEXAGON_EXPERIMENTAL=$env:E
|
||||
}
|
||||
|
||||
if ($null -ne $env:SCHED) {
|
||||
$env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v"
|
||||
}
|
||||
|
||||
if ($null -ne $env:PROF) {
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
|
||||
}
|
||||
|
||||
if ($null -ne $env:OPMASK) {
|
||||
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
|
||||
}
|
||||
|
||||
if ($null -ne $env:NHVX) {
|
||||
$env:GGML_HEXAGON_NHVX=$env:NHVX
|
||||
}
|
||||
|
||||
if ($null -ne $env:NDEV) {
|
||||
$env:GGML_HEXAGON_NDEV=$env:NDEV
|
||||
}
|
||||
|
||||
if ($null -ne $env:HB) {
|
||||
$env:GGML_HEXAGON_HOSTBUF=$env:HB
|
||||
}
|
||||
|
||||
if ($null -ne $env:MTMD_DEVICE) {
|
||||
$env:MTMD_BACKEND_DEVICE=$env:MTMD_DEVICE
|
||||
}
|
||||
|
||||
$env:ADSP_LIBRARY_PATH="$basedir\lib"
|
||||
|
||||
& "$basedir\bin\llama-mtmd-cli.exe" `
|
||||
--no-mmap -m $basedir\..\..\gguf\$model `
|
||||
--mmproj $basedir\..\..\gguf\$mmproj `
|
||||
--image $basedir\..\..\gguf\$image `
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
|
||||
--ctx-size 8192 --ubatch-size 256 -fa on `
|
||||
-ngl 99 --device $device -v $cli_opts
|
||||
@@ -50,6 +50,10 @@ if ($null -ne $env:NDEV) {
|
||||
$env:GGML_HEXAGON_NDEV=$env:NDEV
|
||||
}
|
||||
|
||||
if ($null -ne $env:HB) {
|
||||
$env:GGML_HEXAGON_HOSTBUF=$env:HB
|
||||
}
|
||||
|
||||
$env:ADSP_LIBRARY_PATH="$basedir\lib"
|
||||
|
||||
& "$basedir\bin\$tool" `
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
import sys
|
||||
import subprocess
|
||||
|
||||
HTTPLIB_VERSION = "refs/tags/v0.39.0"
|
||||
HTTPLIB_VERSION = "refs/tags/v0.40.0"
|
||||
|
||||
vendor = {
|
||||
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",
|
||||
|
||||
@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_ARCTIC, "arctic" },
|
||||
{ LLM_ARCH_DEEPSEEK, "deepseek" },
|
||||
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
|
||||
{ LLM_ARCH_DEEPSEEK2OCR, "deepseek2-ocr" },
|
||||
{ LLM_ARCH_CHATGLM, "chatglm" },
|
||||
{ LLM_ARCH_GLM4, "glm4" },
|
||||
{ LLM_ARCH_GLM4_MOE, "glm4moe" },
|
||||
@@ -1571,6 +1572,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
||||
LLM_TENSOR_FFN_UP_SHEXP,
|
||||
};
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
case LLM_ARCH_DEEPSEEK2OCR:
|
||||
case LLM_ARCH_MISTRAL4:
|
||||
return {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
@@ -1579,6 +1581,8 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_Q_A_NORM,
|
||||
LLM_TENSOR_ATTN_KV_A_NORM,
|
||||
LLM_TENSOR_ATTN_K, // deepseek-ocr
|
||||
LLM_TENSOR_ATTN_V, // deepseek-ocr
|
||||
LLM_TENSOR_ATTN_Q,
|
||||
LLM_TENSOR_ATTN_Q_A,
|
||||
LLM_TENSOR_ATTN_Q_B,
|
||||
|
||||
@@ -77,6 +77,7 @@ enum llm_arch {
|
||||
LLM_ARCH_ARCTIC,
|
||||
LLM_ARCH_DEEPSEEK,
|
||||
LLM_ARCH_DEEPSEEK2,
|
||||
LLM_ARCH_DEEPSEEK2OCR,
|
||||
LLM_ARCH_CHATGLM,
|
||||
LLM_ARCH_GLM4,
|
||||
LLM_ARCH_GLM4_MOE,
|
||||
|
||||
@@ -49,6 +49,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
|
||||
{ "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
|
||||
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
|
||||
{ "deepseek-ocr", LLM_CHAT_TEMPLATE_DEEPSEEK_OCR },
|
||||
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
|
||||
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
|
||||
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
|
||||
@@ -548,6 +549,11 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << LU8("<|Assistant|>");
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_OCR) {
|
||||
for (auto message : chat) {
|
||||
// no template
|
||||
ss << message->content;
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
|
||||
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
||||
// EXAONE-3.0-7.8B-Instruct
|
||||
|
||||
@@ -28,6 +28,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_DEEPSEEK,
|
||||
LLM_CHAT_TEMPLATE_DEEPSEEK_2,
|
||||
LLM_CHAT_TEMPLATE_DEEPSEEK_3,
|
||||
LLM_CHAT_TEMPLATE_DEEPSEEK_OCR,
|
||||
LLM_CHAT_TEMPLATE_COMMAND_R,
|
||||
LLM_CHAT_TEMPLATE_LLAMA_3,
|
||||
LLM_CHAT_TEMPLATE_CHATGLM_3,
|
||||
|
||||
+1
-1
@@ -1516,7 +1516,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
|
||||
if (!weight_before_ffn) {
|
||||
experts = ggml_mul(ctx0, experts, weights);
|
||||
cb(cur, "ffn_moe_weighted", il);
|
||||
cb(experts, "ffn_moe_weighted", il);
|
||||
}
|
||||
|
||||
ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
|
||||
|
||||
@@ -1561,7 +1561,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/13870
|
||||
? LLAMA_ROPE_TYPE_NEOX
|
||||
: hparams.rope_type;
|
||||
|
||||
ggml_tensor * tmp;
|
||||
|
||||
if (ggml_is_quantized(cur->type)) {
|
||||
|
||||
+138
-19
@@ -370,6 +370,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
||||
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
||||
ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
||||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
||||
@@ -748,8 +750,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
case LLM_ARCH_BERT:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 3:
|
||||
@@ -781,8 +781,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 12:
|
||||
@@ -797,8 +795,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
case LLM_ARCH_JINA_BERT_V2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
hparams.f_max_alibi_bias = 8.0f;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
@@ -810,8 +806,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
case LLM_ARCH_JINA_BERT_V3:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 24:
|
||||
@@ -823,8 +817,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
|
||||
|
||||
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
||||
@@ -838,8 +830,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
|
||||
if (hparams.n_layer == 28) {
|
||||
type = LLM_TYPE_250M;
|
||||
@@ -848,8 +838,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
case LLM_ARCH_EUROBERT:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
|
||||
if (hparams.n_layer == 12) {
|
||||
type = LLM_TYPE_SMALL; // 0.2B
|
||||
@@ -913,7 +901,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
// fall through
|
||||
case LLM_ARCH_QWEN2:
|
||||
{
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
|
||||
@@ -995,7 +982,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
} break;
|
||||
case LLM_ARCH_QWEN3:
|
||||
{
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
|
||||
@@ -1287,7 +1273,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
|
||||
//applied only if model converted with --sentence-transformers-dense-modules
|
||||
ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
|
||||
@@ -1636,6 +1621,26 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK2OCR:
|
||||
{
|
||||
// similar to deepseek2, but without MLA
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
||||
|
||||
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
|
||||
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 12: type = LLM_TYPE_3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_PLM:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@@ -2084,7 +2089,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
} break;
|
||||
case LLM_ARCH_BAILINGMOE:
|
||||
{
|
||||
@@ -4967,6 +4971,60 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
|
||||
|
||||
// Shared expert branch
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK2OCR:
|
||||
{
|
||||
// similar to deepseek2, but without MLA
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp;
|
||||
const int64_t n_expert_shared = hparams.n_expert_shared;
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
// try to load output.weight, if not found, use token_embd (tied embeddings)
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||
if (!output) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
|
||||
// norm
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
if (i < (int) hparams.n_layer_dense_lead) {
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
} else {
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
||||
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
if (n_expert == 0) {
|
||||
throw std::runtime_error("n_expert must be > 0");
|
||||
}
|
||||
if (n_expert_used == 0) {
|
||||
throw std::runtime_error("n_expert_used must be > 0");
|
||||
}
|
||||
|
||||
// MoE branch
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
|
||||
|
||||
// Shared expert branch
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
||||
@@ -7520,6 +7578,65 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
if (!layer.ssm_beta_s && layer.ssm_beta) {
|
||||
layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
|
||||
// input scales
|
||||
if (!layer.wq_in_s && layer.wq) {
|
||||
layer.wq_in_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wk_in_s && layer.wk) {
|
||||
layer.wk_in_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wv_in_s && layer.wv) {
|
||||
layer.wv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wo_in_s && layer.wo) {
|
||||
layer.wo_in_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wqkv_in_s && layer.wqkv) {
|
||||
layer.wqkv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wqkv_gate_in_s && layer.wqkv_gate) {
|
||||
layer.wqkv_gate_in_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_gate_in_s && layer.ffn_gate) {
|
||||
layer.ffn_gate_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_down_in_s && layer.ffn_down) {
|
||||
layer.ffn_down_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_up_in_s && layer.ffn_up) {
|
||||
layer.ffn_up_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_gate_exps_in_s && layer.ffn_gate_exps) {
|
||||
layer.ffn_gate_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_down_exps_in_s && layer.ffn_down_exps) {
|
||||
layer.ffn_down_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_up_exps_in_s && layer.ffn_up_exps) {
|
||||
layer.ffn_up_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_gate_shexp_in_s && layer.ffn_gate_shexp) {
|
||||
layer.ffn_gate_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_down_shexp_in_s && layer.ffn_down_shexp) {
|
||||
layer.ffn_down_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_up_shexp_in_s && layer.ffn_up_shexp) {
|
||||
layer.ffn_up_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ssm_in_in_s && layer.ssm_in) {
|
||||
layer.ssm_in_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ssm_out_in_s && layer.ssm_out) {
|
||||
layer.ssm_out_in_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ssm_alpha_in_s && layer.ssm_alpha) {
|
||||
layer.ssm_alpha_in_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ssm_beta_in_s && layer.ssm_beta) {
|
||||
layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7858,7 +7975,7 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
|
||||
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
||||
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
|
||||
@@ -8435,6 +8552,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
llm = std::make_unique<llm_build_deepseek>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
case LLM_ARCH_DEEPSEEK2OCR:
|
||||
case LLM_ARCH_GLM_DSA:
|
||||
case LLM_ARCH_MISTRAL4:
|
||||
{
|
||||
@@ -8835,6 +8953,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_ARCTIC:
|
||||
case LLM_ARCH_DEEPSEEK:
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
case LLM_ARCH_DEEPSEEK2OCR:
|
||||
case LLM_ARCH_PLM:
|
||||
case LLM_ARCH_CHATGLM:
|
||||
case LLM_ARCH_GRANITE:
|
||||
|
||||
@@ -414,6 +414,27 @@ struct llama_layer {
|
||||
struct ggml_tensor * ssm_alpha_s = nullptr;
|
||||
struct ggml_tensor * ssm_beta_s = nullptr;
|
||||
|
||||
// input scales
|
||||
struct ggml_tensor * wq_in_s = nullptr;
|
||||
struct ggml_tensor * wk_in_s = nullptr;
|
||||
struct ggml_tensor * wv_in_s = nullptr;
|
||||
struct ggml_tensor * wo_in_s = nullptr;
|
||||
struct ggml_tensor * wqkv_in_s = nullptr;
|
||||
struct ggml_tensor * wqkv_gate_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_gate_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_up_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_gate_exps_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_exps_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_up_exps_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_gate_shexp_in_s= nullptr;
|
||||
struct ggml_tensor * ffn_up_shexp_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_shexp_in_s= nullptr;
|
||||
struct ggml_tensor * ssm_in_in_s = nullptr;
|
||||
struct ggml_tensor * ssm_out_in_s = nullptr;
|
||||
struct ggml_tensor * ssm_alpha_in_s = nullptr;
|
||||
struct ggml_tensor * ssm_beta_in_s = nullptr;
|
||||
|
||||
// altup & laurel
|
||||
struct ggml_tensor * per_layer_inp_gate = nullptr;
|
||||
struct ggml_tensor * per_layer_proj = nullptr;
|
||||
|
||||
+7
-1
@@ -344,7 +344,13 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param
|
||||
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
|
||||
|
||||
// do not quantize specific multimodal tensors
|
||||
quantize &= name.find(".position_embd.") == std::string::npos;
|
||||
quantize &= name.find(".position_embd") == std::string::npos;
|
||||
quantize &= name.find("sam.pos_embd") == std::string::npos;
|
||||
quantize &= name.find("sam.neck.") == std::string::npos;
|
||||
quantize &= name.find("sam.net_") == std::string::npos;
|
||||
quantize &= name.find(".rel_pos") == std::string::npos;
|
||||
quantize &= name.find(".patch_embd") == std::string::npos;
|
||||
quantize &= name.find(".patch_merger") == std::string::npos;
|
||||
|
||||
return quantize;
|
||||
}
|
||||
|
||||
+3
-1
@@ -1952,7 +1952,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
} else if (
|
||||
tokenizer_pre == "qwen2" ||
|
||||
tokenizer_pre == "deepseek-r1-qwen" ||
|
||||
tokenizer_pre == "kormo") {
|
||||
tokenizer_pre == "kormo" ||
|
||||
tokenizer_pre == "f2llmv2") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
@@ -2489,6 +2490,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|| t.first == "[EOS]" // Kimi-K2
|
||||
|| t.first == "<|end_of_text|>"
|
||||
|| t.first == "<end_of_utterance>" // smoldocling
|
||||
|| t.first == "<|end▁of▁sentence|>" // deepseek-ocr
|
||||
) {
|
||||
special_eog_ids.insert(t.second);
|
||||
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
|
||||
@@ -2,6 +2,9 @@
|
||||
|
||||
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
|
||||
bool is_ocr = model.arch == LLM_ARCH_DEEPSEEK2OCR;
|
||||
|
||||
const bool is_mla = hparams.is_mla();
|
||||
|
||||
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
|
||||
@@ -54,7 +57,38 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self_attention
|
||||
{
|
||||
if (is_ocr) {
|
||||
const int n_embed_head = hparams.n_embd / hparams.n_head();
|
||||
const int ocr_rope_type = GGML_ROPE_TYPE_NEOX;
|
||||
GGML_ASSERT(n_embed_head == n_embd_head_k && n_embed_head == n_embd_head_v);
|
||||
|
||||
ggml_tensor * Qcur = NULL;
|
||||
ggml_tensor * Kcur = NULL;
|
||||
ggml_tensor * Vcur = NULL;
|
||||
|
||||
Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Qcur, "q", il);
|
||||
cb(Kcur, "k", il);
|
||||
cb(Vcur, "v", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embed_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embed_head, n_head, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embed_head, n_head, n_tokens);
|
||||
|
||||
GGML_ASSERT(fabs(freq_base - 10000.0) < 1e-4);
|
||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_embed_head, ocr_rope_type, 0, freq_base, 1, 0, 1, 0, 0);
|
||||
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_embed_head, ocr_rope_type, 0, freq_base, 1, 0, 1, 0, 0);
|
||||
cb(Qcur, "q_pe", il);
|
||||
cb(Kcur, "k_pe", il);
|
||||
|
||||
cur = build_attn(inp_attn_kv,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
else {
|
||||
ggml_tensor * q = NULL;
|
||||
|
||||
const bool is_lite = model.layers[il].wq;
|
||||
|
||||
+23
-14
@@ -4823,28 +4823,33 @@ struct test_conv_transpose_1d : public test_case {
|
||||
|
||||
// GGML_OP_CONV_TRANSPOSE_2D
|
||||
struct test_conv_transpose_2d : public test_case {
|
||||
// Dimensions
|
||||
const std::array<int64_t, 4> ne_input;
|
||||
const std::array<int64_t, 4> ne_kernel;
|
||||
const int stride;
|
||||
// Types
|
||||
const ggml_type kernel_type;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(ne_input, ne_kernel, stride);
|
||||
return VARS_TO_STR4(kernel_type, ne_input, ne_kernel, stride);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 5e-4; // The default 1e-7 is too small for Vulkan.
|
||||
}
|
||||
|
||||
test_conv_transpose_2d(std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
|
||||
std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
|
||||
int stride = 1)
|
||||
: ne_input(ne_input), ne_kernel(ne_kernel), stride(stride){}
|
||||
test_conv_transpose_2d(
|
||||
std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
|
||||
std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
|
||||
int stride = 1,
|
||||
ggml_type kernel_type = GGML_TYPE_F16
|
||||
) : ne_input(ne_input), ne_kernel(ne_kernel), stride(stride), kernel_type(kernel_type) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
|
||||
ggml_set_name(input, "input");
|
||||
|
||||
ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne_kernel.data());
|
||||
ggml_tensor * kernel = ggml_new_tensor(ctx, kernel_type, 4, ne_kernel.data());
|
||||
ggml_set_name(kernel, "kernel");
|
||||
|
||||
ggml_tensor * out = ggml_conv_transpose_2d_p0(ctx, kernel, input, stride);
|
||||
@@ -7279,7 +7284,7 @@ static const ggml_type all_types[] = {
|
||||
GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
|
||||
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
|
||||
GGML_TYPE_Q8_0,
|
||||
GGML_TYPE_MXFP4,
|
||||
GGML_TYPE_MXFP4, GGML_TYPE_NVFP4,
|
||||
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
||||
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
|
||||
GGML_TYPE_Q6_K,
|
||||
@@ -7295,7 +7300,7 @@ static const ggml_type base_types[] = {
|
||||
GGML_TYPE_Q4_0,
|
||||
GGML_TYPE_Q4_1, // for I8MM tests
|
||||
GGML_TYPE_Q4_K,
|
||||
GGML_TYPE_MXFP4, // TODO: or "other"
|
||||
GGML_TYPE_MXFP4, GGML_TYPE_NVFP4, // TODO: or "other"
|
||||
GGML_TYPE_IQ2_XXS
|
||||
};
|
||||
|
||||
@@ -7704,9 +7709,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
|
||||
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({129, 63, 35, 1}, {3, 3, 48, 35}, 1));
|
||||
for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1, kernel_type));
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2, kernel_type));
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({129, 63, 35, 1}, {3, 3, 48, 35}, 1, kernel_type));
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 500, 1, 1}));
|
||||
test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 5000, 1, 1}));
|
||||
@@ -8892,9 +8899,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, false));
|
||||
test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, true));
|
||||
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({256, 256, 256, 1}, {3, 3, 16, 256}, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({16, 16, 16, 1}, {3, 3, 8, 16}, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));
|
||||
for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({256, 256, 256, 1}, {3, 3, 16, 256}, 1, kernel_type));
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({16, 16, 16, 1}, {3, 3, 8, 16}, 1, kernel_type));
|
||||
test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2, kernel_type));
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));
|
||||
|
||||
|
||||
@@ -1330,7 +1330,7 @@ static void test_nemotron_reasoning_detection(testing & t) {
|
||||
analysis.analyze_template(tmpl);
|
||||
|
||||
// Check reasoning markers
|
||||
t.assert_equal("reasoning_start should be '<think>'", "<think>", analysis.reasoning.start);
|
||||
t.assert_equal("reasoning_start should be '<think>\\n'", "<think>\n", analysis.reasoning.start);
|
||||
t.assert_equal("reasoning_end should be '</think>'", "</think>", analysis.reasoning.end);
|
||||
|
||||
// Check reasoning mode detection
|
||||
|
||||
+344
-81
@@ -805,7 +805,8 @@ struct peg_test_case {
|
||||
common_chat_templates_inputs params;
|
||||
std::string input;
|
||||
common_chat_msg expect;
|
||||
bool is_partial = false;
|
||||
bool is_partial = false;
|
||||
bool expect_reconstruction = false;
|
||||
};
|
||||
|
||||
struct make_peg_parser {
|
||||
@@ -828,6 +829,12 @@ struct make_peg_parser {
|
||||
}
|
||||
};
|
||||
|
||||
// Global template filter for --template flag
|
||||
static std::string g_template_filter;
|
||||
|
||||
// When true, run reconstruction test on every non-partial test and report results
|
||||
static bool g_force_reconstruction_test = false;
|
||||
|
||||
static void test_peg_parser(common_chat_templates * tmpls,
|
||||
const std::function<void(peg_test_case &)> & init,
|
||||
bool detailed_debug) {
|
||||
@@ -936,75 +943,158 @@ static void test_peg_parser(common_chat_templates * tmpls,
|
||||
throw std::runtime_error("Failed to build grammar: " + parser.params_.grammar);
|
||||
}
|
||||
|
||||
// Find the earliest trigger position to determine the constrained portion
|
||||
auto earliest_trigger_pos = std::string::npos;
|
||||
for (const auto & trigger : parser.params_.grammar_triggers) {
|
||||
size_t pos = std::string::npos;
|
||||
std::smatch match;
|
||||
switch (trigger.type) {
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
|
||||
{
|
||||
const auto & word = trigger.value;
|
||||
pos = tc.input.find(word);
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
|
||||
{
|
||||
const auto & pattern = std::regex(trigger.value);
|
||||
if (std::regex_search(tc.input, match, pattern)) {
|
||||
pos = match.position(pattern.mark_count());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
|
||||
{
|
||||
const auto & pattern = trigger.value;
|
||||
if (std::regex_match(tc.input, match, std::regex(pattern))) {
|
||||
auto mpos = std::string::npos;
|
||||
for (size_t i = 1; i < match.size(); ++i) {
|
||||
if (match[i].length() > 0) {
|
||||
mpos = match.position(i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (mpos == std::string::npos) {
|
||||
mpos = match.position(0);
|
||||
}
|
||||
pos = mpos;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
throw std::runtime_error("Unknown trigger type");
|
||||
}
|
||||
if (pos != std::string::npos) {
|
||||
if (earliest_trigger_pos == std::string::npos || pos < earliest_trigger_pos) {
|
||||
earliest_trigger_pos = pos;
|
||||
// In production, grammar triggers match against the full generated text
|
||||
// including the generation prompt. All positions are in full_input coordinates.
|
||||
const auto & gen_prompt = parser.params_.generation_prompt;
|
||||
std::string full_input = gen_prompt + tc.input;
|
||||
|
||||
// Determine whether the reasoning-budget sampler path applies: tool-call grammar
|
||||
// with all WORD triggers and thinking tags present. In production, the reasoning
|
||||
// budget sampler inhibits grammar application while inside thinking blocks —
|
||||
// triggers inside <think>...</think> are suppressed.
|
||||
bool use_reasoning_budget_path = false;
|
||||
if (parser.params_.grammar_lazy && !parser.params_.thinking_end_tag.empty()) {
|
||||
use_reasoning_budget_path = true;
|
||||
for (const auto & trigger : parser.params_.grammar_triggers) {
|
||||
if (trigger.type != COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
||||
use_reasoning_budget_path = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Determine the constrained portion of input to test against grammar
|
||||
std::string constrained = tc.input;
|
||||
// Find the earliest trigger position to determine the constrained portion
|
||||
auto earliest_trigger_pos = std::string::npos;
|
||||
|
||||
if (use_reasoning_budget_path) {
|
||||
// Reasoning-budget path: simulate thinking-aware trigger detection.
|
||||
// Walk through full_input tracking thinking state; only match triggers
|
||||
// when outside thinking blocks.
|
||||
const auto & think_start = parser.params_.thinking_start_tag;
|
||||
const auto & think_end = parser.params_.thinking_end_tag;
|
||||
|
||||
bool in_thinking = false;
|
||||
for (size_t i = 0; i < full_input.size(); ++i) {
|
||||
if (!in_thinking && !think_start.empty()
|
||||
&& full_input.compare(i, think_start.size(), think_start) == 0) {
|
||||
in_thinking = true;
|
||||
i += think_start.size() - 1;
|
||||
continue;
|
||||
}
|
||||
if (in_thinking && full_input.compare(i, think_end.size(), think_end) == 0) {
|
||||
in_thinking = false;
|
||||
i += think_end.size() - 1;
|
||||
continue;
|
||||
}
|
||||
if (in_thinking) {
|
||||
continue;
|
||||
}
|
||||
// Outside thinking — check if any trigger word starts here
|
||||
for (const auto & trigger : parser.params_.grammar_triggers) {
|
||||
if (full_input.compare(i, trigger.value.size(), trigger.value) == 0) {
|
||||
if (earliest_trigger_pos == std::string::npos || i < earliest_trigger_pos) {
|
||||
earliest_trigger_pos = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (earliest_trigger_pos != std::string::npos) {
|
||||
break; // found the earliest
|
||||
}
|
||||
}
|
||||
|
||||
// If the reasoning-budget path found no trigger outside thinking but the test
|
||||
// expects tool calls, this template nests tool calls inside thinking
|
||||
// blocks (e.g. Kimi). Fall back to the legacy path for this case.
|
||||
if (earliest_trigger_pos == std::string::npos && !tc.expect.tool_calls.empty()) {
|
||||
use_reasoning_budget_path = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!use_reasoning_budget_path) {
|
||||
// Legacy path: find triggers without thinking-awareness
|
||||
for (const auto & trigger : parser.params_.grammar_triggers) {
|
||||
size_t pos = std::string::npos;
|
||||
std::smatch match;
|
||||
switch (trigger.type) {
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
|
||||
{
|
||||
const auto & word = trigger.value;
|
||||
pos = full_input.find(word);
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
|
||||
{
|
||||
const auto & compiled = std::regex(trigger.value);
|
||||
if (std::regex_search(full_input, match, compiled)) {
|
||||
pos = match.position(compiled.mark_count());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
|
||||
{
|
||||
// In production, PATTERN_FULL triggers are checked against
|
||||
// the text generated so far, growing token by token. Simulate
|
||||
// by trying every prefix of full_input.
|
||||
const auto & compiled = std::regex(trigger.value);
|
||||
for (size_t end = gen_prompt.size(); end <= full_input.size(); ++end) {
|
||||
std::string prefix = full_input.substr(0, end);
|
||||
if (std::regex_match(prefix, match, compiled)) {
|
||||
pos = std::string::npos;
|
||||
for (size_t gi = 1; gi < match.size(); ++gi) {
|
||||
if (match[gi].length() > 0) {
|
||||
pos = match.position(gi);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (pos == std::string::npos) {
|
||||
pos = match.position(0);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
throw std::runtime_error("Unknown trigger type");
|
||||
}
|
||||
if (pos != std::string::npos) {
|
||||
if (earliest_trigger_pos == std::string::npos || pos < earliest_trigger_pos) {
|
||||
earliest_trigger_pos = pos;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If the test expects tool calls and the grammar is lazy, the trigger must fire.
|
||||
// Otherwise the grammar would never activate in production and tool calls wouldn't
|
||||
// be constrained. A silent skip here would hide broken triggers.
|
||||
if (parser.params_.grammar_lazy && !tc.expect.tool_calls.empty() && !tc.is_partial
|
||||
&& earliest_trigger_pos == std::string::npos) {
|
||||
std::string trigger_desc;
|
||||
for (const auto & trigger : parser.params_.grammar_triggers) {
|
||||
trigger_desc += "\n [type=" + std::to_string(trigger.type) + "] " + trigger.value;
|
||||
}
|
||||
throw std::runtime_error(
|
||||
"Grammar trigger did not fire, but test expects tool calls (lazy grammar).\n"
|
||||
">>> Input: " + full_input + "\n"
|
||||
">>> Triggers (" + std::to_string(parser.params_.grammar_triggers.size()) + "):" + trigger_desc);
|
||||
}
|
||||
|
||||
// Determine the constrained portion of input to test against grammar.
|
||||
// If the trigger position falls inside the generation prompt, the grammar
|
||||
// sampler was already active before model output began — constrain from the
|
||||
// start of the model output (i.e. tc.input).
|
||||
std::string constrained = full_input;
|
||||
bool grammar_triggered = false;
|
||||
if (earliest_trigger_pos != std::string::npos) {
|
||||
constrained = tc.input.substr(earliest_trigger_pos);
|
||||
auto constrain_from = std::max(earliest_trigger_pos, gen_prompt.size());
|
||||
constrained = full_input.substr(constrain_from);
|
||||
grammar_triggered = true;
|
||||
} else if (!parser.params_.grammar_lazy) {
|
||||
// For non-lazy grammars, the entire input should match
|
||||
grammar_triggered = true;
|
||||
}
|
||||
|
||||
// For non-lazy grammars, prepend reasoning prefill to grammar input, just like
|
||||
// PEG parsing does. The grammar includes the full reasoning pattern (e.g. optional
|
||||
// <think>...</think>), but the model output may start mid-reasoning if the template
|
||||
// already placed the opening tag in the prompt.
|
||||
// For lazy grammars, the grammar only activates from the trigger position, so the
|
||||
// reasoning prefill is irrelevant — reasoning is handled by the PEG parser.
|
||||
if (!parser.params_.generation_prompt.empty() && earliest_trigger_pos == std::string::npos) {
|
||||
constrained = parser.params_.generation_prompt + constrained;
|
||||
}
|
||||
|
||||
// Test the constrained portion against the grammar
|
||||
if (grammar_triggered && !tc.is_partial) {
|
||||
auto result = match_string_detailed(constrained, grammar.get());
|
||||
@@ -1036,10 +1126,57 @@ static void test_peg_parser(common_chat_templates * tmpls,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Global template filter for --template flag
|
||||
static std::string g_template_filter;
|
||||
// Reconstruction test: verify that appending the parsed message to the original
|
||||
// messages and re-rendering the template (without generation prompt) reproduces
|
||||
// the original prompt + input exactly, or as a proper prefix (the template may
|
||||
// append end-of-turn tokens after the assistant message).
|
||||
if ((tc.expect_reconstruction || g_force_reconstruction_test) && !tc.is_partial) {
|
||||
// Start from tc.expect but copy tool call arguments from the actual parser
|
||||
// output, which preserves original JSON formatting (e.g. {"arg1":1} vs {"arg1": 1}).
|
||||
auto reconstruction_msg = tc.expect;
|
||||
auto parsed_msg = parser.parse(tc.input, false);
|
||||
for (size_t i = 0; i < reconstruction_msg.tool_calls.size() && i < parsed_msg.tool_calls.size(); i++) {
|
||||
reconstruction_msg.tool_calls[i].arguments = parsed_msg.tool_calls[i].arguments;
|
||||
}
|
||||
common_chat_templates_inputs reconstruction_inputs = tc.params;
|
||||
reconstruction_inputs.messages.push_back(reconstruction_msg);
|
||||
reconstruction_inputs.add_generation_prompt = false;
|
||||
|
||||
auto reconstruction_params = common_chat_templates_apply(tmpls, reconstruction_inputs);
|
||||
std::string expected_text = parser.params_.prompt + tc.input;
|
||||
bool match = reconstruction_params.prompt == expected_text ||
|
||||
(reconstruction_params.prompt.size() > expected_text.size() &&
|
||||
reconstruction_params.prompt.compare(0, expected_text.size(), expected_text) == 0);
|
||||
if (!match && g_force_reconstruction_test && !tc.expect_reconstruction) {
|
||||
// In forced mode, report mismatch but don't fail
|
||||
// Find the first difference position
|
||||
size_t diff_pos = 0;
|
||||
size_t min_len = std::min(expected_text.size(), reconstruction_params.prompt.size());
|
||||
while (diff_pos < min_len && expected_text[diff_pos] == reconstruction_params.prompt[diff_pos]) {
|
||||
diff_pos++;
|
||||
}
|
||||
size_t ctx_start = diff_pos > 60 ? diff_pos - 60 : 0;
|
||||
size_t ctx_end_e = std::min(expected_text.size(), diff_pos + 40);
|
||||
size_t ctx_end_r = std::min(reconstruction_params.prompt.size(), diff_pos + 40);
|
||||
LOG_ERR("\x1b[31m[RECONSTRUCTION FAIL]\x1b[0m "
|
||||
"first diff at byte %zu (expected len=%zu, reconstructed len=%zu)\n"
|
||||
" expected: ...%s...\n"
|
||||
" reconstructed: ...%s...\n",
|
||||
diff_pos, expected_text.size(), reconstruction_params.prompt.size(),
|
||||
expected_text.substr(ctx_start, ctx_end_e - ctx_start).c_str(),
|
||||
reconstruction_params.prompt.substr(ctx_start, ctx_end_r - ctx_start).c_str());
|
||||
} else if (!match) {
|
||||
std::string error_msg =
|
||||
"Reconstruction mismatch:\n\n"
|
||||
">>> Expected (prompt + input):\n" + expected_text +
|
||||
"\n\n>>> Reconstructed:\n" + reconstruction_params.prompt;
|
||||
throw std::runtime_error(error_msg);
|
||||
} else if (g_force_reconstruction_test) {
|
||||
LOG_INF("\x1b[32m[RECONSTRUCTION OK]\x1b[0m\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fluent builder for PEG parser tests
|
||||
class peg_test_builder;
|
||||
@@ -1099,6 +1236,11 @@ class peg_test_builder {
|
||||
return *this;
|
||||
}
|
||||
|
||||
peg_test_builder & expect_reconstruction(bool val = true) {
|
||||
tc_.expect_reconstruction = val;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Expect setters
|
||||
peg_test_builder & expect(const common_chat_msg & msg) {
|
||||
tc_.expect = msg;
|
||||
@@ -1272,16 +1414,18 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// Ministral-3-14B-Reasoning-2512
|
||||
auto tst = peg_tester("models/templates/mistralai-Ministral-3-14B-Reasoning-2512.jinja", detailed_debug);
|
||||
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
|
||||
tst.test("[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?")
|
||||
.expect_content("[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?")
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test("[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.expect(message_assist_thoughts)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test(R"([TOOL_CALLS]special_function[ARGS]{"arg1":1})")
|
||||
@@ -1311,6 +1455,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
@@ -1323,6 +1468,20 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_reasoning("I need to output the invoice details in JSON")
|
||||
.expect_content(R"({"amount": 123.45, "date": "2025-12-03"})")
|
||||
.run();
|
||||
|
||||
// fake tool call marker in reasoning
|
||||
tst.test(
|
||||
"[THINK]Let me think about [TOOL_CALLS]special_function[ARGS]{\"arg1\":1} and more[/THINK]"
|
||||
R"([TOOL_CALLS]special_function[ARGS]{"arg1": 1})")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.tools({ special_function_tool })
|
||||
.expect_reasoning("Let me think about [TOOL_CALLS]special_function[ARGS]{\"arg1\":1} and more")
|
||||
.expect_tool_calls({
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
{
|
||||
@@ -1425,6 +1584,50 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_reasoning("I need to output the invoice details in JSON")
|
||||
.expect_content(R"({"amount": 123.45, "date": "2025-12-03"})")
|
||||
.run();
|
||||
|
||||
// tool call segment in reasoning
|
||||
tst.test(
|
||||
"Let's call a tool: <tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Not the real call!\")\n"
|
||||
"\n"
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call></think>\n"
|
||||
"<tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Hello, world!\")\n"
|
||||
"\n"
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>"
|
||||
)
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({
|
||||
python_tool
|
||||
})
|
||||
.expect_reasoning("Let's call a tool: <tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Not the real call!\")\n"
|
||||
"\n"
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.expect_tool_calls({
|
||||
{ "python", "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
}
|
||||
|
||||
{
|
||||
@@ -1481,9 +1684,9 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// Google Gemma 2 2B - does not support tool calling
|
||||
auto tst = peg_tester("models/templates/google-gemma-2-2b-it.jinja");
|
||||
|
||||
tst.test("Hello, world!").expect(simple_assist_msg("Hello, world!")).run();
|
||||
tst.test("Hello, world!").expect(simple_assist_msg("Hello, world!")).expect_reconstruction().run();
|
||||
|
||||
tst.test("Line 1\nLine 2\nLine 3").expect(simple_assist_msg("Line 1\nLine 2\nLine 3")).run();
|
||||
tst.test("Line 1\nLine 2\nLine 3").expect(simple_assist_msg("Line 1\nLine 2\nLine 3")).expect_reconstruction().run();
|
||||
}
|
||||
|
||||
{
|
||||
@@ -1526,7 +1729,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// Test simple content-only template
|
||||
auto tst = peg_tester("models/templates/google-gemma-2-2b-it.jinja", detailed_debug);
|
||||
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
}
|
||||
{
|
||||
// IBM Granite (reasoning and tool calling model)
|
||||
@@ -1638,7 +1841,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// Qwen3-Coder (tool calling with XML-style format)
|
||||
auto tst = peg_tester("models/templates/Qwen3-Coder.jinja", detailed_debug);
|
||||
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
@@ -1650,6 +1853,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
"</tool_call>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
@@ -1678,6 +1882,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test with code content (multiline)
|
||||
@@ -1698,6 +1903,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "python", "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test with code content (asian unicode chars)
|
||||
@@ -1715,6 +1921,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "python", "{\"code\": \"格\"}", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test with HTML tag content
|
||||
@@ -1736,6 +1943,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "html", "{\"markup\": \"<html>\\n <head>\\n <title>Hello!</title>\\n </head>\\n</html>\"}", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test with TODO list (array of objects)
|
||||
@@ -1753,6 +1961,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "todo_list", "{\"todos\": [{\"item\": \"Check stuff\", \"selected\": false}, {\"item\": \"Prepare stuff\", \"selected\": true}]}", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test flexible optional argument ordering (2 required + 4 optional, reversed optional order)
|
||||
@@ -1769,6 +1978,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "tool_2req_4opt", R"({"req1": "hello", "req2": 42, "opt4": 100, "opt2": 200})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test flexible optional argument ordering (2 required + 5 optional, reversed optional order)
|
||||
@@ -1786,6 +1996,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "tool_2req_5opt", R"({"req1": "world", "req2": 7, "opt5": "last", "opt3": "middle", "opt1": "first"})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test flexible optional argument ordering (2 required + 5 optional, all 5 in shuffled order)
|
||||
@@ -1805,6 +2016,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "tool_2req_5opt", R"({"req1": "test", "req2": 99, "opt3": "c", "opt1": "a", "opt5": "e", "opt4": 4, "opt2": 2})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
{
|
||||
@@ -1885,6 +2097,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
tst.test("Hello, world!\nWhat's up?")
|
||||
.enable_thinking(false)
|
||||
.expect(message_assist)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Reasoning with content (forced-open mode - input starts after <think>)
|
||||
@@ -1892,6 +2105,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
|
||||
.expect(message_assist_thoughts)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Tool call without reasoning
|
||||
@@ -1902,6 +2116,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.enable_thinking(false)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Tool call with reasoning (forced-open mode)
|
||||
@@ -1914,6 +2129,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_thoughts)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
@@ -1933,6 +2149,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// #20650: tool with no required args, model emits <tool_call>name</tool_call> with no arg tags.
|
||||
@@ -1950,6 +2167,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.tools({ no_args_tool })
|
||||
.expect_reasoning("Let me read the diff content.")
|
||||
.expect_tool_calls({{ "read_file_diff_md", "{}", {} }})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
}
|
||||
@@ -2208,22 +2426,24 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
|
||||
// Kimi-K2 old template
|
||||
auto tst = peg_tester("models/templates/moonshotai-Kimi-K2.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test(
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>"
|
||||
"{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(kimi_id_special_func_tool_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Kimi-K2-Instruct
|
||||
auto tst2 = peg_tester("models/templates/Kimi-K2-Instruct.jinja", detailed_debug);
|
||||
tst2.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst2.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst2.test(
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>"
|
||||
"{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(kimi_id_special_func_tool_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
@@ -2297,6 +2517,19 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.tools({ empty_args_tool })
|
||||
.expect(simple_assist_msg("", "", "empty_args", "{}"))
|
||||
.run();
|
||||
|
||||
// fake tool call marker in reasoning
|
||||
tst.test(
|
||||
"<think>Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm</think>"
|
||||
"<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ special_function_tool })
|
||||
.expect_reasoning("Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm")
|
||||
.expect_tool_calls({
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
})
|
||||
.run();
|
||||
}
|
||||
|
||||
// Apertus-8B-Instruct tests - FUNC_NAME_AS_KEY format
|
||||
@@ -2306,6 +2539,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
tst.test("<|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
@@ -2314,7 +2548,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
{
|
||||
auto tst = peg_tester("models/templates/MiniMax-M2.jinja", detailed_debug);
|
||||
tst.test(
|
||||
"</think><minimax:tool_call>\n<invoke name=\"special_function\">\n<parameter "
|
||||
"<minimax:tool_call>\n<invoke name=\"special_function\">\n<parameter "
|
||||
"name=\"arg1\">1</parameter>\n</invoke>\n</minimax:tool_call>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
@@ -2364,37 +2598,41 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// mistralai-Mistral-Nemo-Instruct-2407.jinja
|
||||
{
|
||||
auto tst = peg_tester("models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("[TOOL_CALLS][{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, \"id\": \"123456789\"}]")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_id)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
{
|
||||
auto tst = peg_tester("models/templates/meetkai-functionary-medium-v3.1.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("<function=special_function>{\"arg1\": 1}</function>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
// Functionary v3.2 - recipient-based format: >>>recipient\n{content}
|
||||
{
|
||||
auto tst = peg_tester("models/templates/meetkai-functionary-medium-v3.2.jinja", detailed_debug);
|
||||
tst.test("all\nHello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("all\nHello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("special_function\n{\"arg1\": 1}")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
// FireFunction
|
||||
{
|
||||
auto tst = peg_tester("models/templates/fireworks-ai-llama-3-firefunction-v2.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test(" functools[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
@@ -2455,10 +2693,11 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
{ "models/templates/MiMo-VL.jinja", "models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja",
|
||||
"models/templates/Qwen-Qwen2.5-7B-Instruct.jinja" }) {
|
||||
auto tst = peg_tester(path, detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n</tool_call>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
@@ -2481,6 +2720,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.expect(simple_assist_msg("Hello, world!\nWhat's up?", "Here are my reasoning steps:\nI'm\nthinking"))
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Reasoning + Tool calls
|
||||
@@ -2497,42 +2737,45 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// Mistral Small 3.2 - FUNC_BRACKET_TAG format: [TOOL_CALLS]func_name[CALL_ID]id[ARGS]{...}
|
||||
{
|
||||
auto tst = peg_tester("models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("[TOOL_CALLS]special_function[CALL_ID]123456789[ARGS]{\"arg1\": 1}")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_id)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
// Devstral
|
||||
{
|
||||
auto tst = peg_tester("models/templates/unsloth-mistral-Devstral-Small-2507.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("[TOOL_CALLS]special_function[ARGS]{\"arg1\": 1}")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
tst.test("Hello, world!\nWhat's up?[TOOL_CALLS]special_function[ARGS]{\"arg1\": 1}")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_content)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
{
|
||||
// Llama 3.1
|
||||
auto tst = peg_tester("models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).expect_reconstruction().run();
|
||||
}
|
||||
|
||||
{
|
||||
// Llama 3.2
|
||||
auto tst = peg_tester("models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).expect_reconstruction().run();
|
||||
}
|
||||
|
||||
{
|
||||
// Llama 3.3
|
||||
auto tst = peg_tester("models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ python_tool }).expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ python_tool }).expect(message_assist).expect_reconstruction().run();
|
||||
}
|
||||
|
||||
// GPT-OSS format tests
|
||||
@@ -2553,6 +2796,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect(message_assist_thoughts)
|
||||
.run();
|
||||
|
||||
// Analysis channel (reasoning) with final channel (content) with reasoning_format = none
|
||||
tst.test(
|
||||
"<|channel|>analysis<|message|>I'm\nthinking<|end|><|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's "
|
||||
"up?")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_NONE)
|
||||
.expect_content("<|channel|>analysis<|message|>I'm\nthinking<|end|>Hello, world!\nWhat's up?")
|
||||
.run();
|
||||
|
||||
// Analysis channel only (partial) - still works when reasoning format is set
|
||||
tst.test("<|channel|>analysis<|message|>I'm\nthinking")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
@@ -2562,24 +2813,28 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
|
||||
// Tool call with recipient in role header: " to=functions.NAME<|channel|>analysis<|message|>JSON"
|
||||
tst.test(" to=functions.special_function<|channel|>analysis<|message|>{\"arg1\": 1}")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.run();
|
||||
|
||||
// Tool call with recipient in channel header: "<|channel|>analysis to=functions.NAME<|message|>JSON"
|
||||
tst.test("<|channel|>analysis to=functions.special_function<|message|>{\"arg1\": 1}")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.run();
|
||||
|
||||
// Tool call with constraint: " to=functions.NAME<|channel|>analysis <|constrain|>json<|message|>JSON"
|
||||
tst.test(" to=functions.special_function<|channel|>analysis <|constrain|>json<|message|>{\"arg1\": 1}")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.run();
|
||||
|
||||
// Tool call in commentary channel (channel header variant)
|
||||
tst.test("<|channel|>commentary to=functions.special_function<|message|>{\"arg1\": 1}")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.run();
|
||||
@@ -2836,10 +3091,11 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// GigaChat V3
|
||||
{
|
||||
auto tst = peg_tester("models/templates/GigaChat3-10B-A1.8B.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("<|message_sep|>\n\nfunction call<|role_sep|>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
@@ -2848,16 +3104,18 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_content)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
// GigaChat V3.1
|
||||
{
|
||||
auto tst = peg_tester("models/templates/GigaChat3.1-10B-A1.8B.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("<|function_call|>{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
@@ -2866,6 +3124,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_content)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
}
|
||||
@@ -3002,6 +3261,10 @@ int main(int argc, char ** argv) {
|
||||
detailed_debug = true;
|
||||
common_log_set_verbosity_thold(999);
|
||||
}
|
||||
if (arg == "--force-reconstruction-test") {
|
||||
g_force_reconstruction_test = true;
|
||||
only_run_filtered = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (only_run_filtered) {
|
||||
|
||||
@@ -467,6 +467,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
|
||||
if (arch == LLM_ARCH_PLM) {
|
||||
continue; // TODO tensor shapes
|
||||
}
|
||||
if (arch == LLM_ARCH_DEEPSEEK2OCR) {
|
||||
continue; // TODO tensor shapes
|
||||
}
|
||||
|
||||
// FIXME some models are segfaulting with WebGPU:
|
||||
#ifdef GGML_USE_WEBGPU
|
||||
|
||||
@@ -61,8 +61,6 @@ static void test_reasoning_budget(
|
||||
|
||||
// Feed the sequence and track when forcing occurs
|
||||
for (size_t i = 0; i < sequence.size(); i++) {
|
||||
llama_sampler_accept(sampler, sequence[i]);
|
||||
|
||||
// Check if we're in forcing state by applying and seeing if logits are modified
|
||||
cur_p.selected = -1;
|
||||
for (size_t j = 0; j < cur.size(); j++) {
|
||||
@@ -81,6 +79,8 @@ static void test_reasoning_budget(
|
||||
}
|
||||
}
|
||||
|
||||
llama_sampler_accept(sampler, sequence[i]);
|
||||
|
||||
fprintf(stderr, " i=%zu: token=%d, finite_count=%zu, finite_token=%d\n", i, (int)sequence[i], finite_count, (int)finite_token);
|
||||
|
||||
if (finite_count == 1) {
|
||||
@@ -167,9 +167,9 @@ int main(void) {
|
||||
}
|
||||
|
||||
// Test 2: Budget exhausted, forcing should occur
|
||||
// Flow: i=0 accept(100)->COUNTING, i=1 accept(50)->remaining=1, i=2 accept(51)->remaining=0->FORCING
|
||||
// Forcing is active at i=2 and i=3 (when apply() is called while in FORCING state)
|
||||
// At i=4, force_pos becomes 2 which equals forced_tokens.size(), so state becomes DONE
|
||||
// Flow: i=0 apply()->passthrough, accept(100)->COUNTING; i=1 accept(50)->remaining=1
|
||||
// i=2 accept(51)->remaining=0->FORCING; i=3 apply() forces token[0]; i=4 apply() forces token[1]
|
||||
// At i=4, accept() advances force_pos to 2 which equals forced_tokens.size(), so state becomes DONE
|
||||
{
|
||||
const std::vector<llama_token> start = {100};
|
||||
const std::vector<llama_token> end = {101};
|
||||
@@ -179,13 +179,12 @@ int main(void) {
|
||||
test_reasoning_budget("budget exhausted forcing", sequence, start, end, forced,
|
||||
2, // budget of 2 tokens
|
||||
REASONING_BUDGET_IDLE,
|
||||
2, // forcing starts at i=2 (after accept(51) depletes budget, apply() forces)
|
||||
3); // forcing continues through i=3 (at i=4 state becomes DONE)
|
||||
3, // forcing starts at i=3 (accept at i=2 depletes budget, apply at i=3 forces)
|
||||
4); // forcing continues through i=4 (accept at i=4 transitions to DONE)
|
||||
}
|
||||
|
||||
// Test 3: Activate immediately with budget=0, forcing should start right away
|
||||
// Flow: Since no start token in sequence, state stays IDLE (no start/end configured means passthrough)
|
||||
// This test needs start token to be in the sequence or use activate_immediately with start token present
|
||||
// Flow: init promotes COUNTING+budget=0 to FORCING, so apply() sees FORCING at i=0
|
||||
{
|
||||
const std::vector<llama_token> start = {100};
|
||||
const std::vector<llama_token> end = {101};
|
||||
@@ -195,8 +194,8 @@ int main(void) {
|
||||
test_reasoning_budget("activate immediately budget=0", sequence, start, end, forced,
|
||||
0, // budget of 0 tokens
|
||||
REASONING_BUDGET_COUNTING, // starts counting, promoted to FORCING since budget=0
|
||||
0, // forcing starts at i=0 (after accept(100), budget=0 goes straight to FORCING)
|
||||
1); // forcing continues through i=1 (at i=2 state becomes DONE)
|
||||
0, // forcing starts at i=0 (initialized in FORCING, apply forces immediately)
|
||||
1); // forcing continues through i=1 (accept at i=1 transitions to DONE)
|
||||
}
|
||||
|
||||
// Test 4: No start/end tokens configured - passthrough (no forcing)
|
||||
@@ -214,7 +213,7 @@ int main(void) {
|
||||
|
||||
// Test 5: Activate immediately with budget > 0, count down then force
|
||||
// Flow: i=0 accept(50)->remaining=1, i=1 accept(51)->remaining=0->FORCING
|
||||
// So forcing starts at i=1 (apply after accept sees FORCING with force_pos=0)
|
||||
// Forcing starts at i=2 (apply sees FORCING after accept at i=1 transitioned)
|
||||
{
|
||||
const std::vector<llama_token> start = {100};
|
||||
const std::vector<llama_token> end = {101};
|
||||
@@ -224,8 +223,8 @@ int main(void) {
|
||||
test_reasoning_budget("activate immediately with budget", sequence, start, end, forced,
|
||||
2, // budget of 2 tokens
|
||||
REASONING_BUDGET_COUNTING,
|
||||
1, // forcing starts at i=1 (after 2 accepts deplete budget)
|
||||
2); // forcing continues through i=2
|
||||
2, // forcing starts at i=2 (after 2 accepts deplete budget, apply at i=2 forces)
|
||||
3); // forcing continues through i=3
|
||||
}
|
||||
|
||||
printf("OK (5 tests passed)\n");
|
||||
|
||||
+81
-18
@@ -100,7 +100,7 @@ struct cli_context {
|
||||
}
|
||||
|
||||
// reasoning budget sampler
|
||||
if (reasoning_budget >= 0 && !chat_params.thinking_end_tag.empty()) {
|
||||
if (!chat_params.thinking_end_tag.empty()) {
|
||||
const llama_vocab * vocab = llama_model_get_vocab(
|
||||
llama_get_model(ctx_server.get_llama_context()));
|
||||
|
||||
@@ -224,10 +224,11 @@ struct cli_context {
|
||||
};
|
||||
|
||||
// TODO?: Make this reusable, enums, docs
|
||||
static const std::array<const std::string, 6> cmds = {
|
||||
static const std::array<const std::string, 7> cmds = {
|
||||
"/audio ",
|
||||
"/clear",
|
||||
"/exit",
|
||||
"/glob ",
|
||||
"/image ",
|
||||
"/read ",
|
||||
"/regen",
|
||||
@@ -258,7 +259,7 @@ static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std:
|
||||
}
|
||||
}
|
||||
|
||||
if (!cmd.empty() && line.length() >= cmd.length() && cursor_byte_pos >= cmd.length()) {
|
||||
if (!cmd.empty() && cmd != "/glob " && line.length() >= cmd.length() && cursor_byte_pos >= cmd.length()) {
|
||||
const std::string path_prefix = std::string(line.substr(cmd.length(), cursor_byte_pos - cmd.length()));
|
||||
const std::string path_postfix = std::string(line.substr(cursor_byte_pos));
|
||||
auto cur_dir = std::filesystem::current_path();
|
||||
@@ -339,6 +340,8 @@ static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std:
|
||||
return matches;
|
||||
}
|
||||
|
||||
static constexpr size_t FILE_GLOB_MAX_RESULTS = 100;
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
@@ -430,7 +433,8 @@ int main(int argc, char ** argv) {
|
||||
console::log(" /exit or Ctrl+C stop or exit\n");
|
||||
console::log(" /regen regenerate the last response\n");
|
||||
console::log(" /clear clear the chat history\n");
|
||||
console::log(" /read add a text file\n");
|
||||
console::log(" /read <file> add a text file\n");
|
||||
console::log(" /glob <pattern> add text files using globbing pattern\n");
|
||||
if (inf.has_inp_image) {
|
||||
console::log(" /image <file> add an image file\n");
|
||||
}
|
||||
@@ -441,6 +445,27 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// interactive loop
|
||||
std::string cur_msg;
|
||||
|
||||
auto add_text_file = [&](const std::string & fname) -> bool {
|
||||
std::string marker = ctx_cli.load_input_file(fname, false);
|
||||
if (marker.empty()) {
|
||||
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
|
||||
return false;
|
||||
}
|
||||
if (inf.fim_sep_token != LLAMA_TOKEN_NULL) {
|
||||
cur_msg += common_token_to_piece(ctx_cli.ctx_server.get_llama_context(), inf.fim_sep_token, true);
|
||||
cur_msg += fname;
|
||||
cur_msg.push_back('\n');
|
||||
} else {
|
||||
cur_msg += "--- File: ";
|
||||
cur_msg += fname;
|
||||
cur_msg += " ---\n";
|
||||
}
|
||||
cur_msg += marker;
|
||||
console::log("Loaded text from '%s'\n", fname.c_str());
|
||||
return true;
|
||||
};
|
||||
|
||||
while (true) {
|
||||
std::string buffer;
|
||||
console::set_display(DISPLAY_TYPE_USER_INPUT);
|
||||
@@ -525,22 +550,60 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
} else if (string_starts_with(buffer, "/read ")) {
|
||||
std::string fname = string_strip(buffer.substr(6));
|
||||
std::string marker = ctx_cli.load_input_file(fname, false);
|
||||
if (marker.empty()) {
|
||||
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
|
||||
continue;
|
||||
add_text_file(fname);
|
||||
continue;
|
||||
} else if (string_starts_with(buffer, "/glob ")) {
|
||||
std::error_code ec;
|
||||
size_t count = 0;
|
||||
auto curdir = std::filesystem::current_path();
|
||||
std::string pattern = string_strip(buffer.substr(6));
|
||||
std::filesystem::path rel_path;
|
||||
|
||||
auto startglob = pattern.find_first_of("![*?");
|
||||
if (startglob != std::string::npos && startglob != 0) {
|
||||
auto endpath = pattern.substr(0, startglob).find_last_of('/');
|
||||
if (endpath != std::string::npos) {
|
||||
std::string rel_pattern = pattern.substr(0, endpath);
|
||||
#if !defined(_WIN32)
|
||||
if (string_starts_with(rel_pattern, "~")) {
|
||||
const char * home = std::getenv("HOME");
|
||||
if (home && home[0]) {
|
||||
rel_pattern = std::string(home) + rel_pattern.substr(1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
rel_path = rel_pattern;
|
||||
pattern.erase(0, endpath + 1);
|
||||
curdir /= rel_path;
|
||||
}
|
||||
}
|
||||
if (inf.fim_sep_token != LLAMA_TOKEN_NULL) {
|
||||
cur_msg += common_token_to_piece(ctx_cli.ctx_server.get_llama_context(), inf.fim_sep_token, true);
|
||||
cur_msg += fname;
|
||||
cur_msg.push_back('\n');
|
||||
} else {
|
||||
cur_msg += "--- File: ";
|
||||
cur_msg += fname;
|
||||
cur_msg += " ---\n";
|
||||
|
||||
for (const auto & entry : std::filesystem::recursive_directory_iterator(curdir,
|
||||
std::filesystem::directory_options::skip_permission_denied, ec)) {
|
||||
if (!entry.is_regular_file()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string rel = std::filesystem::relative(entry.path(), curdir, ec).string();
|
||||
if (ec) {
|
||||
ec.clear();
|
||||
continue;
|
||||
}
|
||||
std::replace(rel.begin(), rel.end(), '\\', '/');
|
||||
|
||||
if (!glob_match(pattern, rel)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!add_text_file((rel_path / rel).string())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (++count >= FILE_GLOB_MAX_RESULTS) {
|
||||
console::error("Maximum number of globbed files allowed (%zu) reached.\n", FILE_GLOB_MAX_RESULTS);
|
||||
break;
|
||||
}
|
||||
}
|
||||
cur_msg += marker;
|
||||
console::log("Loaded text from '%s'\n", fname.c_str());
|
||||
continue;
|
||||
} else {
|
||||
// not a command
|
||||
|
||||
@@ -146,13 +146,19 @@ int main(int argc, char ** argv) {
|
||||
|
||||
ctx = llama_init->context();
|
||||
model = llama_init->model();
|
||||
smpl = llama_init->sampler(0);
|
||||
|
||||
if (ctx == NULL) {
|
||||
LOG_ERR("%s: error: unable to create context\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
smpl = llama_init->sampler(0);
|
||||
|
||||
llama_memory_t mem = llama_get_memory(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
@@ -695,7 +701,7 @@ int main(int argc, char ** argv) {
|
||||
if (!common_prompt_batch_decode(ctx, embd, n_past, params.n_batch, path_session, save_now)) {
|
||||
return 1;
|
||||
}
|
||||
session_tokens.insert(session_tokens.end(), embd.begin(), embd.begin());
|
||||
session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
|
||||
n_session_consumed = session_tokens.size();
|
||||
session_do_save = false;
|
||||
|
||||
|
||||
@@ -143,11 +143,20 @@ static void compute_statistics(std::vector<tensor_statistics> & tstats, const st
|
||||
activations.reserve(e.values.size());
|
||||
|
||||
for (int i = 0; i < n_mat; ++i) {
|
||||
if (e.counts[i] == 0) {
|
||||
LOG_DBG("%s: skipping tensor %s due to zero count at index %d\n", __func__, name.c_str(), i);
|
||||
continue;
|
||||
}
|
||||
for (int j = 0; j < row_size; ++j) {
|
||||
activations.push_back(e.values[i*row_size + j] / e.counts[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (activations.empty()) {
|
||||
LOG_ERR("%s: all counts are zero for tensor %s, skipping statistics computation\n", __func__, name.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
const float act_total = std::accumulate(activations.begin(), activations.end(), 0.0f);
|
||||
const float act_max = *std::max_element(activations.begin(), activations.end());
|
||||
const float act_min = *std::min_element(activations.begin(), activations.end());
|
||||
@@ -1142,10 +1151,12 @@ static bool show_statistics(const common_params & params) {
|
||||
blk = -1; // not a block layer
|
||||
}
|
||||
|
||||
const float entropy_norm = (tstat.elements > 0) ? 100.0f * (tstat.entropy / std::log2(tstat.elements)) : 0.0f;
|
||||
|
||||
LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n",
|
||||
layer.c_str(), name.c_str(), tstat.total_sqract, tstat.min_sqract, tstat.max_sqract, tstat.mean_sqract,
|
||||
tstat.stddev, tstat.active * 100.0f, tstat.elements, tstat.entropy,
|
||||
100.0f * (tstat.entropy / std::log2(tstat.elements)), 100.0f * tstat.zd, tstat.cossim);
|
||||
entropy_norm, 100.0f * tstat.zd, tstat.cossim);
|
||||
|
||||
const float weighted_bias = tstat.elements * tstat.total_sqract;
|
||||
const float weighted_zd = tstat.elements * tstat.zd;
|
||||
|
||||
@@ -1807,7 +1807,7 @@ struct markdown_printer : public printer {
|
||||
if (!is_cpu_backend) {
|
||||
fields.emplace_back("n_gpu_layers");
|
||||
}
|
||||
if (params.n_cpu_moe.size() > 1) {
|
||||
if (params.n_cpu_moe.size() > 1 || params.n_cpu_moe != cmd_params_defaults.n_cpu_moe) {
|
||||
fields.emplace_back("n_cpu_moe");
|
||||
}
|
||||
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
||||
|
||||
@@ -5,6 +5,7 @@ find_package(Threads REQUIRED)
|
||||
add_library(mtmd
|
||||
mtmd.cpp
|
||||
mtmd-audio.cpp
|
||||
mtmd-image.cpp
|
||||
mtmd.h
|
||||
mtmd-helper.cpp
|
||||
mtmd-helper.h
|
||||
@@ -30,6 +31,7 @@ add_library(mtmd
|
||||
models/qwen3vl.cpp
|
||||
models/siglip.cpp
|
||||
models/whisper-enc.cpp
|
||||
models/deepseekocr.cpp
|
||||
models/mobilenetv5.cpp
|
||||
models/youtuvl.cpp
|
||||
)
|
||||
|
||||
+21
-4
@@ -51,13 +51,14 @@
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
||||
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
||||
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
|
||||
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
||||
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
||||
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
|
||||
|
||||
#define KEY_SAM_N_HEAD "clip.vision.sam.head_count"
|
||||
#define KEY_SAM_N_BLOCK "clip.vision.sam.block_count"
|
||||
#define KEY_SAM_N_EMBD "clip.vision.sam.embedding_length"
|
||||
// audio-specific
|
||||
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
|
||||
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
|
||||
@@ -99,12 +100,13 @@
|
||||
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||
#define TN_IMAGE_NEWLINE "v.image_newline"
|
||||
#define TN_IMAGE_SEPERATOR "v.view_seperator"
|
||||
#define TN_MM_INP_NORM "mm.input_norm.weight"
|
||||
#define TN_MM_INP_NORM_B "mm.input_norm.bias"
|
||||
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
||||
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
||||
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
|
||||
#define TN_MM_PROJECTOR "mm.model.fc.%s" // idefics3, deepseekocr
|
||||
#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v
|
||||
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
|
||||
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
|
||||
@@ -143,6 +145,19 @@
|
||||
#define TN_TOK_BOI "v.boi"
|
||||
#define TN_TOK_EOI "v.eoi"
|
||||
|
||||
// deepseek-ocr
|
||||
#define TN_SAM_POS_EMBD "v.sam.pos_embd.%s"
|
||||
#define TN_SAM_PATCH_EMBD "v.sam.patch_embd.%s"
|
||||
#define TN_SAM_PRE_NORM "v.sam.blk.%d.pre_ln.%s"
|
||||
#define TN_SAM_POST_NORM "v.sam.blk.%d.post_ln.%s"
|
||||
#define TN_SAM_ATTN_POS_H "v.sam.blk.%d.attn.pos_h.%s"
|
||||
#define TN_SAM_ATTN_POS_W "v.sam.blk.%d.attn.pos_w.%s"
|
||||
#define TN_SAM_ATTN_QKV "v.sam.blk.%d.attn.qkv.%s"
|
||||
#define TN_SAM_ATTN_OUT "v.sam.blk.%d.attn.out.%s"
|
||||
#define TN_SAM_FFN_UP "v.sam.blk.%d.mlp.lin1.%s"
|
||||
#define TN_SAM_FFN_DOWN "v.sam.blk.%d.mlp.lin2.%s"
|
||||
#define TN_SAM_NECK "v.sam.neck.%d.%s"
|
||||
#define TN_SAM_NET "v.sam.net_%d.%s"
|
||||
// (conformer) lfm2
|
||||
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
|
||||
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
|
||||
@@ -236,6 +251,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_LIGHTONOCR,
|
||||
PROJECTOR_TYPE_COGVLM,
|
||||
PROJECTOR_TYPE_JANUS_PRO,
|
||||
PROJECTOR_TYPE_DEEPSEEKOCR,
|
||||
PROJECTOR_TYPE_LFM2A,
|
||||
PROJECTOR_TYPE_GLM4V,
|
||||
PROJECTOR_TYPE_YOUTUVL,
|
||||
@@ -273,6 +289,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
|
||||
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
|
||||
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
||||
{ PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
|
||||
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
|
||||
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
|
||||
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
|
||||
|
||||
+63
-4
@@ -28,6 +28,13 @@ enum patch_merge_type {
|
||||
PATCH_MERGE_SPATIAL_UNPAD,
|
||||
};
|
||||
|
||||
enum resize_algo {
|
||||
RESIZE_ALGO_BILINEAR, // stretch to target resolution
|
||||
RESIZE_ALGO_BICUBIC, // center-crop when aspect ratio doesn't match
|
||||
RESIZE_ALGO_BICUBIC_PILLOW,
|
||||
// RESIZE_ALGO_LANCZOS, // TODO
|
||||
};
|
||||
|
||||
struct clip_hparams {
|
||||
int32_t image_size = 0;
|
||||
int32_t patch_size = 0;
|
||||
@@ -37,13 +44,26 @@ struct clip_hparams {
|
||||
int32_t n_head = 0;
|
||||
int32_t n_layer = 0;
|
||||
// idefics3
|
||||
int32_t n_merge = 0; // number of patch merges **per-side**
|
||||
|
||||
// for preprocessor
|
||||
int32_t image_longest_edge = 0;
|
||||
int32_t image_min_pixels = -1;
|
||||
int32_t image_max_pixels = -1;
|
||||
int32_t n_merge = 0; // number of patch merges **per-side**
|
||||
resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
|
||||
bool image_resize_pad = true; // if false, center-crop will be applied when resizing
|
||||
std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
|
||||
|
||||
// (preprocessor) for llava-uhd style models
|
||||
std::vector<clip_image_size> image_res_candidates;
|
||||
int32_t preproc_min_tiles = 0;
|
||||
int32_t preproc_max_tiles = 0;
|
||||
resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
|
||||
resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
|
||||
bool image_pad_rf = true; // if true, refined image will be padded (e.g. llava-1.6)
|
||||
bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
|
||||
std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
|
||||
std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
|
||||
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
@@ -60,13 +80,16 @@ struct clip_hparams {
|
||||
float eps = 1e-6;
|
||||
float rope_theta = 0.0;
|
||||
|
||||
std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
|
||||
int32_t image_crop_resolution;
|
||||
std::unordered_set<int32_t> vision_feature_layer;
|
||||
int32_t attn_window_size = 0;
|
||||
int32_t n_wa_pattern = 0;
|
||||
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
|
||||
|
||||
// deepseek-ocr (sam)
|
||||
int32_t sam_n_layer = 0;
|
||||
int32_t sam_n_head = 0;
|
||||
int32_t sam_n_embd = 0;
|
||||
|
||||
// audio
|
||||
int32_t n_mel_bins = 0; // whisper preprocessor
|
||||
int32_t proj_stack_factor = 0; // ultravox
|
||||
@@ -102,6 +125,21 @@ struct clip_hparams {
|
||||
warmup_image_size = n_tok_per_side * patch_size * cur_merge;
|
||||
// TODO: support warmup size for custom token numbers
|
||||
}
|
||||
// sam vit deepseek-ocr
|
||||
std::vector<int32_t> global_attn_indices() const {
|
||||
return { 2, 5, 8, 11 };
|
||||
}
|
||||
bool is_global_attn(int32_t layer) const {
|
||||
const auto indices = global_attn_indices();
|
||||
|
||||
for (const auto & idx : indices) {
|
||||
if (layer == idx) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
struct clip_layer {
|
||||
@@ -148,6 +186,9 @@ struct clip_layer {
|
||||
ggml_tensor * deepstack_fc2_w = nullptr;
|
||||
ggml_tensor * deepstack_fc2_b = nullptr;
|
||||
|
||||
// sam rel_pos
|
||||
ggml_tensor * rel_pos_w = nullptr;
|
||||
ggml_tensor * rel_pos_h = nullptr;
|
||||
// lfm2
|
||||
ggml_tensor * ff_norm_w = nullptr;
|
||||
ggml_tensor * ff_norm_b = nullptr;
|
||||
@@ -240,7 +281,6 @@ struct clip_model {
|
||||
ggml_tensor * post_ln_w;
|
||||
ggml_tensor * post_ln_b;
|
||||
|
||||
ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
|
||||
ggml_tensor * mm_fc_w;
|
||||
ggml_tensor * mm_fc_b;
|
||||
ggml_tensor * mm_ffn_up_w = nullptr;
|
||||
@@ -261,6 +301,8 @@ struct clip_model {
|
||||
ggml_tensor * mm_2_b = nullptr;
|
||||
|
||||
ggml_tensor * image_newline = nullptr;
|
||||
ggml_tensor * view_seperator = nullptr;
|
||||
|
||||
|
||||
// Yi type models with mlp+normalization projection
|
||||
ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
|
||||
@@ -372,6 +414,23 @@ struct clip_model {
|
||||
ggml_tensor * mm_boi = nullptr;
|
||||
ggml_tensor * mm_eoi = nullptr;
|
||||
|
||||
// deepseek ocr sam
|
||||
ggml_tensor * patch_embed_proj_w = nullptr;
|
||||
ggml_tensor * patch_embed_proj_b = nullptr;
|
||||
ggml_tensor * pos_embed = nullptr;
|
||||
|
||||
ggml_tensor * neck_0_w;
|
||||
ggml_tensor * neck_1_w;
|
||||
ggml_tensor * neck_1_b;
|
||||
ggml_tensor * neck_2_w;
|
||||
ggml_tensor * neck_3_w;
|
||||
ggml_tensor * neck_3_b;
|
||||
ggml_tensor * net_2;
|
||||
ggml_tensor * net_3;
|
||||
|
||||
int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder
|
||||
|
||||
std::vector<clip_layer> sam_layers;
|
||||
// lfm2 audio
|
||||
std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
|
||||
std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
|
||||
|
||||
+156
-1064
File diff suppressed because it is too large
Load Diff
@@ -97,9 +97,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
|
||||
*/
|
||||
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
|
||||
|
||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||
|
||||
struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||
|
||||
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
|
||||
@@ -0,0 +1,324 @@
|
||||
#include "models.h"
|
||||
|
||||
// Implementation based on approach suggested by Acly
|
||||
// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091
|
||||
static ggml_tensor * window_partition(ggml_context * ctx0, ggml_tensor * x, const int window) {
|
||||
auto [c, w, h, b] = x->ne;
|
||||
// same as
|
||||
// x = ggml_win_part(m, x, window);
|
||||
// x = ggml_reshape_3d(m, x, c, window * window, x->ne[3]);
|
||||
|
||||
const int64_t px = (window - w % window) % window;
|
||||
const int64_t py = (window - h % window) % window;
|
||||
const int64_t npw = (w + px) / window;
|
||||
const int64_t nph = (h + py) / window;
|
||||
|
||||
ggml_tensor * cur = x;
|
||||
if (px > 0 || py > 0) {
|
||||
cur = ggml_pad(ctx0, cur, 0, static_cast<int>(px), static_cast<int>(py), 0);
|
||||
}
|
||||
cur = ggml_reshape_4d(ctx0, cur, c * window, npw, window, nph * b);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
|
||||
cur = ggml_reshape_4d(ctx0, cur, c, window, window, npw * nph * b);
|
||||
return cur;
|
||||
}
|
||||
|
||||
// Implementation based on approach suggested by Acly
|
||||
// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091
|
||||
static ggml_tensor * window_unpartition(ggml_context * ctx0,
|
||||
ggml_tensor * x,
|
||||
const int w,
|
||||
const int h,
|
||||
const int window) {
|
||||
const int64_t c = x->ne[0];
|
||||
// same as
|
||||
// x = ggml_reshape_4d(m, x, c, window, window, x->ne[2]);
|
||||
// x = ggml_win_unpart(m, x, w, h, window);
|
||||
|
||||
const int64_t px = (window - w % window) % window;
|
||||
const int64_t py = (window - h % window) % window;
|
||||
const int64_t npw = (w + px) / window;
|
||||
const int64_t nph = (h + py) / window;
|
||||
|
||||
const int64_t b = x->ne[3] / (npw * nph);
|
||||
ggml_tensor * cur = x;
|
||||
cur = ggml_reshape_4d(ctx0, cur, c * window, window, npw, nph * b);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
|
||||
cur = ggml_reshape_4d(ctx0, cur, c, w + px, h + py, b);
|
||||
cur = ggml_view_4d(ctx0, cur, cur->ne[0], w, h, cur->ne[3], cur->nb[1], cur->nb[2], cur->nb[3], 0);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
return cur;
|
||||
}
|
||||
|
||||
static ggml_tensor * get_rel_pos(ggml_context * ctx0,
|
||||
ggml_tensor * rel_pos, // [L, C]
|
||||
ggml_tensor * indices, // [q_size, k_size]
|
||||
const int q_size,
|
||||
const int k_size) {
|
||||
const int64_t C = rel_pos->ne[0]; // channels
|
||||
const int64_t L = rel_pos->ne[1]; // length
|
||||
|
||||
GGML_ASSERT(indices != nullptr);
|
||||
GGML_ASSERT(indices->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(indices->ne[0] == k_size);
|
||||
GGML_ASSERT(indices->ne[1] == q_size);
|
||||
|
||||
const auto max_rel_dist = 2 * std::max(q_size, k_size) - 1;
|
||||
ggml_tensor * cur = rel_pos;
|
||||
|
||||
if (max_rel_dist != L) {
|
||||
// Linear interpolation
|
||||
const int64_t ne0 = cur->ne[0];
|
||||
const int64_t ne1 = cur->ne[1];
|
||||
const int64_t ne2 = cur->ne[2];
|
||||
const int64_t ne3 = cur->ne[3];
|
||||
|
||||
cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3)), ne1, 1, ne0 * ne2 * ne3);
|
||||
cur = ggml_reshape_4d(
|
||||
ctx0, ggml_interpolate(ctx0, cur, max_rel_dist, 1, ne0 * ne2 * ne3, 1, GGML_SCALE_MODE_BILINEAR),
|
||||
max_rel_dist, ne0, ne2, ne3);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3));
|
||||
}
|
||||
|
||||
// Flatten indices to 1D for ggml_get_rows
|
||||
const int qk = q_size * k_size;
|
||||
|
||||
cur = ggml_reshape_3d(ctx0, ggml_get_rows(ctx0, cur, ggml_reshape_1d(ctx0, indices, qk)), C, k_size, q_size);
|
||||
|
||||
return cur; // [C, k_size, q_size]
|
||||
}
|
||||
|
||||
ggml_cgraph * clip_graph_deepseekocr::build() {
|
||||
// patch embedding
|
||||
ggml_tensor * inp_raw = build_inp_raw();
|
||||
|
||||
ggml_tensor * sam_out;
|
||||
// Building SAM
|
||||
{
|
||||
const int n_embd = hparams.sam_n_embd;
|
||||
const int n_layer = hparams.sam_n_layer;
|
||||
const int n_heads = hparams.sam_n_head;
|
||||
const int d_heads = n_embd / n_heads;
|
||||
const int window = hparams.attn_window_size;
|
||||
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
|
||||
inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
|
||||
inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3));
|
||||
|
||||
ggml_tensor * rel_pos_indices_local;
|
||||
ggml_tensor * rel_pos_indices_global;
|
||||
|
||||
rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window);
|
||||
rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]);
|
||||
ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local");
|
||||
ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global");
|
||||
ggml_set_input(rel_pos_indices_local);
|
||||
ggml_set_input(rel_pos_indices_global);
|
||||
|
||||
ggml_tensor * cur;
|
||||
const auto tgt_size = inpL->ne[1];
|
||||
const auto str_size = model.pos_embed->ne[1];
|
||||
|
||||
if (str_size != tgt_size) {
|
||||
ggml_tensor * old_pos_embed = nullptr;
|
||||
old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
|
||||
ggml_tensor * new_pos_embed =
|
||||
ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC);
|
||||
new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
|
||||
cur = ggml_add(ctx0, inpL, new_pos_embed);
|
||||
} else {
|
||||
cur = ggml_add(ctx0, inpL, model.pos_embed);
|
||||
}
|
||||
|
||||
// loop over layers
|
||||
for (int il = 0; il < n_layer; il++) {
|
||||
auto & layer = model.sam_layers[il];
|
||||
ggml_tensor * shortcut = cur;
|
||||
|
||||
// layernorm1
|
||||
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
|
||||
|
||||
const int64_t w0 = cur->ne[1];
|
||||
const int64_t h0 = cur->ne[2];
|
||||
|
||||
ggml_tensor * indices;
|
||||
|
||||
if (hparams.is_global_attn(il)) {
|
||||
indices = rel_pos_indices_global;
|
||||
} else {
|
||||
// local attention layer - apply window partition
|
||||
cur = window_partition(ctx0, cur, window);
|
||||
indices = rel_pos_indices_local;
|
||||
}
|
||||
|
||||
const int64_t W = cur->ne[1];
|
||||
const int64_t H = cur->ne[2];
|
||||
// self-attention
|
||||
{
|
||||
const int B = cur->ne[3];
|
||||
|
||||
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
||||
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
||||
cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
|
||||
cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
|
||||
|
||||
ggml_tensor * Q;
|
||||
ggml_tensor * K;
|
||||
ggml_tensor * V;
|
||||
|
||||
Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
|
||||
Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);
|
||||
|
||||
K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]);
|
||||
K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B);
|
||||
|
||||
V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
|
||||
V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);
|
||||
|
||||
ggml_tensor * mask;
|
||||
ggml_tensor * rw;
|
||||
ggml_tensor * rh;
|
||||
ggml_tensor * qr;
|
||||
|
||||
rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C]
|
||||
rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C]
|
||||
qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
|
||||
qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
|
||||
|
||||
rw = ggml_mul_mat(ctx0, rw,
|
||||
ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W]
|
||||
rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W]
|
||||
rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
|
||||
rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);
|
||||
rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H]
|
||||
rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B);
|
||||
mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W]
|
||||
mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
|
||||
mask = ggml_cast(ctx0, mask, GGML_TYPE_F16);
|
||||
|
||||
const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));
|
||||
|
||||
cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
|
||||
il); // [B, H*W, n_embd]
|
||||
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B);
|
||||
}
|
||||
|
||||
if (hparams.is_global_attn(il) == false) {
|
||||
// local attention layer - reverse window partition
|
||||
cur = window_unpartition(ctx0, cur, w0, h0, window);
|
||||
}
|
||||
|
||||
// re-add the layer input, e.g., residual
|
||||
cur = ggml_add(ctx0, cur, shortcut);
|
||||
|
||||
ggml_tensor * inpFF = cur;
|
||||
|
||||
// layernorm2
|
||||
cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
|
||||
|
||||
// ffn
|
||||
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
|
||||
hparams.ffn_op, il);
|
||||
|
||||
// residual 2
|
||||
cur = ggml_add(ctx0, cur, inpFF);
|
||||
cb(cur, "sam_layer_out", il);
|
||||
}
|
||||
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
||||
|
||||
cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
||||
cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
||||
|
||||
cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
||||
cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
||||
|
||||
cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
|
||||
cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
|
||||
cb(cur, "sam_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
sam_out = cur;
|
||||
}
|
||||
|
||||
ggml_tensor * clip_out;
|
||||
// Building DS-OCR CLIP
|
||||
{
|
||||
ggml_tensor * inp;
|
||||
|
||||
inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out));
|
||||
inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]);
|
||||
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
|
||||
|
||||
ggml_tensor * new_pos_embd =
|
||||
ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings));
|
||||
|
||||
int n_pos = new_pos_embd->ne[1]; // +1 for [CLS]
|
||||
const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
|
||||
const auto src_size = static_cast<int>(std::sqrt(n_pos - 1));
|
||||
|
||||
if (tgt_size != src_size) {
|
||||
ggml_tensor * old_pos_embd;
|
||||
ggml_tensor * cls_tok;
|
||||
|
||||
old_pos_embd = ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], src_size * src_size,
|
||||
ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), 0);
|
||||
cls_tok = ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], 1,
|
||||
ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), src_size * src_size);
|
||||
new_pos_embd = ggml_interpolate(ctx0, old_pos_embd, tgt_size, tgt_size, new_pos_embd->ne[0], 1,
|
||||
GGML_SCALE_MODE_BICUBIC);
|
||||
new_pos_embd = ggml_reshape_3d(ctx0, new_pos_embd, n_embd, tgt_size * tgt_size, 1);
|
||||
new_pos_embd = ggml_concat(ctx0, new_pos_embd, cls_tok, 1);
|
||||
n_pos = tgt_size * tgt_size + 1;
|
||||
}
|
||||
|
||||
// add CLS token
|
||||
inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
|
||||
|
||||
// for selecting learned pos embd, used by ViT
|
||||
ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32);
|
||||
ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions);
|
||||
|
||||
ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
clip_out = cur;
|
||||
}
|
||||
|
||||
const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
|
||||
|
||||
sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3));
|
||||
sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches);
|
||||
clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]);
|
||||
|
||||
ggml_tensor * cur;
|
||||
cur = ggml_concat(ctx0, clip_out, sam_out, 0);
|
||||
cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_fc_b);
|
||||
|
||||
const auto h = static_cast<int>(std::sqrt(static_cast<float>(cur->ne[1])));
|
||||
const auto w = h;
|
||||
const auto n_dim = cur->ne[0];
|
||||
|
||||
ggml_tensor * imgnl;
|
||||
ggml_tensor * vs;
|
||||
|
||||
imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
|
||||
vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1)
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
|
||||
cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
|
||||
cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1)
|
||||
|
||||
cb(cur, "dsocr_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
return gf;
|
||||
}
|
||||
@@ -97,7 +97,7 @@ ggml_cgraph * clip_graph_glm4v::build() {
|
||||
|
||||
// FC projector
|
||||
{
|
||||
cur = build_mm(model.projection, cur);
|
||||
cur = build_mm(model.mm_fc_w, cur);
|
||||
// default LayerNorm (post_projection_norm)
|
||||
cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
|
||||
cur = ggml_gelu_erf(ctx0, cur);
|
||||
|
||||
@@ -77,6 +77,11 @@ struct clip_graph_whisper_enc : clip_graph {
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_deepseekocr : clip_graph {
|
||||
clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_conformer : clip_graph {
|
||||
clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
||||
@@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_siglip::build() {
|
||||
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
|
||||
const int scale_factor = model.hparams.n_merge;
|
||||
cur = build_patch_merge_permute(cur, scale_factor);
|
||||
cur = build_mm(model.projection, cur);
|
||||
cur = build_mm(model.mm_fc_w, cur);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_LFM2) {
|
||||
// pixel unshuffle block
|
||||
|
||||
@@ -13,23 +13,20 @@
|
||||
|
||||
constexpr bool DEBUG = false;
|
||||
|
||||
void mtmd_audio_cache::fill_sin_cos_table(int n) {
|
||||
void mtmd_audio_cache::fill_sin_cos_table(uint32_t n) {
|
||||
sin_vals.resize(n);
|
||||
cos_vals.resize(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
for (uint32_t i = 0; i < n; i++) {
|
||||
double theta = (2 * M_PI * i) / n;
|
||||
sin_vals[i] = sinf(theta);
|
||||
cos_vals[i] = cosf(theta);
|
||||
}
|
||||
}
|
||||
|
||||
void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
|
||||
void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) {
|
||||
hann_window.resize(length);
|
||||
int offset = -1;
|
||||
if (periodic) {
|
||||
offset = 0;
|
||||
}
|
||||
for (int i = 0; i < length; i++) {
|
||||
int offset = periodic ? 0 : -1;
|
||||
for (uint32_t i = 0; i < length; i++) {
|
||||
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
|
||||
}
|
||||
}
|
||||
@@ -165,6 +162,7 @@ static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, fl
|
||||
// false = input is complex-valued (interleaved real/imag, stride 2)
|
||||
template <bool Inverse, bool RealInput>
|
||||
static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
|
||||
GGML_ASSERT(N > 0);
|
||||
const int n_sin_cos_vals = cache.sin_vals.size();
|
||||
|
||||
if (N == 1) {
|
||||
@@ -407,6 +405,8 @@ static bool log_mel_spectrogram(
|
||||
}
|
||||
|
||||
|
||||
GGML_ASSERT(params.n_fft_bins > 0);
|
||||
GGML_ASSERT(params.hop_length > 0);
|
||||
out.n_mel = params.n_mel;
|
||||
out.n_len = (n_samples - frame_size) / frame_step + 1;
|
||||
// TODO: handle these checks better
|
||||
@@ -438,6 +438,7 @@ static bool log_mel_spectrogram(
|
||||
|
||||
const int effective_n_len = n_samples_in / frame_step;
|
||||
if (params.norm_per_feature) {
|
||||
GGML_ASSERT(effective_n_len > 1);
|
||||
for (int i = 0; i < out.n_mel; i++) {
|
||||
double mean = 0;
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
@@ -639,6 +640,7 @@ mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length
|
||||
padding_to_remove((n_fft - hop_length) / 2),
|
||||
ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT
|
||||
ifft_out(n_fft * 2 * 4, 0.0f) {
|
||||
GGML_ASSERT(n_fft > 0 && hop_length > 0 && hop_length <= n_fft);
|
||||
cache.fill_sin_cos_table(n_fft);
|
||||
cache.fill_hann_window(n_fft, true);
|
||||
}
|
||||
|
||||
@@ -33,9 +33,9 @@ struct mtmd_audio_cache {
|
||||
|
||||
mtmd_audio_mel_filters filters;
|
||||
|
||||
void fill_sin_cos_table(int n);
|
||||
void fill_sin_cos_table(uint32_t n);
|
||||
|
||||
void fill_hann_window(int length, bool periodic);
|
||||
void fill_hann_window(uint32_t length, bool periodic);
|
||||
|
||||
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
|
||||
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
|
||||
|
||||
@@ -127,6 +127,7 @@ struct decode_embd_batch {
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
GGML_ASSERT(n_tokens > 0 && n_pos_per_embd > 0 && n_mmproj_embd > 0);
|
||||
pos .resize(n_tokens * n_pos_per_embd);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
@@ -157,6 +158,7 @@ struct decode_embd_batch {
|
||||
// M-RoPE for image
|
||||
void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
GGML_ASSERT(nx > 0 && ny > 0 && nx * ny == batch.n_tokens);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
@@ -192,6 +194,7 @@ struct decode_embd_batch {
|
||||
}
|
||||
|
||||
llama_batch get_view(int offset, int n_tokens) {
|
||||
GGML_ASSERT(offset >= 0 && n_tokens > 0 && offset + n_tokens <= batch.n_tokens);
|
||||
llama_pos * pos_ptr;
|
||||
pos_view.clear();
|
||||
pos_view.reserve(n_tokens * n_pos_per_embd);
|
||||
@@ -235,6 +238,7 @@ int32_t mtmd_helper_decode_image_chunk(
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past) {
|
||||
GGML_ASSERT(n_batch > 0);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
@@ -312,6 +316,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past) {
|
||||
GGML_ASSERT(n_batch > 0);
|
||||
int32_t ret;
|
||||
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
@@ -508,6 +513,11 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
|
||||
fseek(f, 0, SEEK_END);
|
||||
long file_size = ftell(f);
|
||||
fseek(f, 0, SEEK_SET);
|
||||
if (file_size < 0) {
|
||||
LOG_ERR("Failed to get file size of %s\n", fname);
|
||||
fclose(f);
|
||||
return nullptr;
|
||||
}
|
||||
buf.resize(file_size);
|
||||
|
||||
size_t n_read = fread(buf.data(), 1, file_size, f);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,150 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "clip-model.h"
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
// base class, models must inherit from this class
|
||||
struct mtmd_image_preprocessor {
|
||||
const clip_hparams & hparams;
|
||||
|
||||
mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
|
||||
|
||||
virtual ~mtmd_image_preprocessor() = default;
|
||||
virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0;
|
||||
|
||||
void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]);
|
||||
void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst);
|
||||
};
|
||||
|
||||
/**
|
||||
* implementation of LLaVA-UHD:
|
||||
* - https://arxiv.org/pdf/2403.11703
|
||||
* - https://github.com/thunlp/LLaVA-UHD
|
||||
* - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
|
||||
*
|
||||
* overview:
|
||||
* - an image always have a single overview (downscaled image)
|
||||
* - an image can have 0 or multiple slices, depending on the image size
|
||||
* - each slice can then be considered as a separate image
|
||||
*
|
||||
* note: the term "slice" and "tile" are used interchangeably
|
||||
*
|
||||
* for example:
|
||||
*
|
||||
* [overview] --> [slice 1] --> [slice 2]
|
||||
* | |
|
||||
* +--> [slice 3] --> [slice 4]
|
||||
*/
|
||||
struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
|
||||
struct slice_coordinates {
|
||||
int x;
|
||||
int y;
|
||||
clip_image_size size;
|
||||
};
|
||||
|
||||
struct slice_instructions {
|
||||
clip_image_size overview_size; // size of downscaled image
|
||||
clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
|
||||
clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
|
||||
std::vector<slice_coordinates> slices;
|
||||
};
|
||||
|
||||
// LFM2 override this function to implement its custom slicing logic
|
||||
virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);
|
||||
|
||||
std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
|
||||
|
||||
private:
|
||||
clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
|
||||
|
||||
clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max);
|
||||
|
||||
/**
|
||||
* Selects the best resolution from a list of possible resolutions based on the original size.
|
||||
*
|
||||
* For example, when given a list of resolutions:
|
||||
* - 100x100
|
||||
* - 200x100
|
||||
* - 100x200
|
||||
* - 200x200
|
||||
*
|
||||
* And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
|
||||
*
|
||||
* @param original_size The original size of the image
|
||||
* @param possible_resolutions A list of possible resolutions
|
||||
* @return The best fit resolution
|
||||
*/
|
||||
clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions);
|
||||
int ensure_divide(int length, int patch_size);
|
||||
clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false);
|
||||
clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio);
|
||||
};
|
||||
|
||||
// downscale or upscale the input image to fixed size
|
||||
struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// resize image to multiple of patch_size*n_merge, while preserving aspect ratio
|
||||
// if image_resize_pad is true, the resized image will be padded, otherwise it will be either stretched or center-cropped depending on image_resize_pad
|
||||
// this is used by models with native support for dynamic image size, for example: Qwen-VL, Pixtral, Kimi-VL, etc
|
||||
struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// similar to mtmd_image_preprocessor_dyn_size, but resize the image to have longest edge equal to hparams.image_longest_edge, while preserving aspect ratio
|
||||
struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// custom llava-uhd slicing logic for LFM2
|
||||
// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
|
||||
struct mtmd_image_preprocessor_lfm2 : mtmd_image_preprocessor_llava_uhd {
|
||||
// ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
|
||||
static constexpr int min_tiles = 2;
|
||||
static constexpr int max_tiles = 10;
|
||||
static constexpr float max_pixels_tolerance = 2.0f;
|
||||
static constexpr int tile_size = 512;
|
||||
|
||||
using mtmd_image_preprocessor_llava_uhd::mtmd_image_preprocessor_llava_uhd;
|
||||
slice_instructions get_slice_instructions(const clip_image_size & original_size) override;
|
||||
|
||||
private:
|
||||
clip_image_size find_closest_aspect_ratio(
|
||||
float aspect_ratio,
|
||||
const std::vector<clip_image_size> & target_ratios,
|
||||
int width, int height);
|
||||
std::vector<clip_image_size> get_target_ratios();
|
||||
clip_image_size get_grid_layout(int height, int width);
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd {
|
||||
mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
|
||||
mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user